From ee39a4154ab84763d141e02a31b4d085ee1babd2 Mon Sep 17 00:00:00 2001
From: Gabor Spaits
Date: Wed, 21 Aug 2024 19:59:34 +0200
Subject: [PATCH 01/15] [MCP] Move unnecessary dependencies if they block copy
 propagation

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp   | 278 +-
 .../AArch64/GlobalISel/arm64-atomic.ll        | 10 +-
 .../AArch64/GlobalISel/arm64-pcsections.ll    | 196 +-
 .../GlobalISel/merge-stores-truncating.ll     | 3 +-
 llvm/test/CodeGen/AArch64/aarch64-mulv.ll     | 3 +-
 llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll | 10 +-
 llvm/test/CodeGen/AArch64/addp-shuffle.ll     | 6 +-
 .../CodeGen/AArch64/anti-dependencies-mcp.mir | 201 ++
 .../CodeGen/AArch64/arm64-non-pow2-ldst.ll    | 8 +-
 .../CodeGen/AArch64/arm64-subvector-extend.ll | 102 +-
 .../CodeGen/AArch64/arm64-windows-calls.ll    | 3 +-
 llvm/test/CodeGen/AArch64/avoid-zero-copy.mir | 3 +
 llvm/test/CodeGen/AArch64/cgp-usubo.ll        | 15 +-
 llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll   | 12 +-
 .../CodeGen/AArch64/duplane-index-patfrags.ll | 12 +-
 llvm/test/CodeGen/AArch64/fcmp.ll             | 18 +-
 llvm/test/CodeGen/AArch64/fexplog.ll          | 510 ++--
 llvm/test/CodeGen/AArch64/fpext.ll            | 46 +-
 llvm/test/CodeGen/AArch64/fpow.ll             | 56 +-
 llvm/test/CodeGen/AArch64/fpowi.ll            | 102 +-
 llvm/test/CodeGen/AArch64/frem.ll             | 56 +-
 llvm/test/CodeGen/AArch64/fsincos.ll          | 204 +-
 .../test/CodeGen/AArch64/ldrpre-ldr-merge.mir | 152 +-
 llvm/test/CodeGen/AArch64/llvm.exp10.ll       | 18 +-
 llvm/test/CodeGen/AArch64/load.ll             | 3 +-
 .../AArch64/lr-reserved-for-ra-live-in.ll     | 4 +-
 .../CodeGen/AArch64/machine-cp-sub-reg.mir    | 6 +-
 .../AArch64/machine-sink-kill-flags.ll        | 5 +-
 .../AArch64/named-vector-shuffles-neon.ll     | 6 +-
 llvm/test/CodeGen/AArch64/neon-extadd.ll      | 54 +-
 llvm/test/CodeGen/AArch64/neon-extmul.ll      | 10 +-
 llvm/test/CodeGen/AArch64/neon-perm.ll        | 3 +-
 llvm/test/CodeGen/AArch64/sext.ll             | 81 +-
 llvm/test/CodeGen/AArch64/shufflevector.ll    | 17 +-
 llvm/test/CodeGen/AArch64/spillfill-sve.mir   | 112 +-
 .../streaming-compatible-memory-ops.ll        | 5 +-
 llvm/test/CodeGen/AArch64/sve-sext-zext.ll    | 27 +-
 .../sve-streaming-mode-fixed-length-trunc.ll  | 4 +-
 .../AArch64/sve-vector-deinterleave.ll        | 15 +-
 .../CodeGen/AArch64/sve-vector-interleave.ll  | 6 +-
 llvm/test/CodeGen/AArch64/vec_umulo.ll        | 14 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll    | 49 +-
 llvm/test/CodeGen/AArch64/vselect-ext.ll      | 17 +-
 llvm/test/CodeGen/AArch64/zext-to-tbl.ll      | 18 +-
 llvm/test/CodeGen/AArch64/zext.ll             | 81 +-
 llvm/test/CodeGen/ARM/addsubo-legalization.ll | 6 +-
 llvm/test/CodeGen/ARM/fpclamptosat_vec.ll     | 43 +-
 llvm/test/CodeGen/ARM/funnel-shift.ll         | 5 +-
 llvm/test/CodeGen/ARM/llvm.exp10.ll           | 12 +-
 .../CodeGen/ARM/load-combine-big-endian.ll    | 12 +-
 llvm/test/CodeGen/ARM/load-combine.ll         | 8 +-
 llvm/test/CodeGen/ARM/sub-cmp-peephole.ll     | 20 +-
 .../ARM/vecreduce-fadd-legalization-strict.ll | 34 +-
 llvm/test/CodeGen/ARM/vlddup.ll               | 30 +-
 llvm/test/CodeGen/ARM/vldlane.ll              | 31 +-
 llvm/test/CodeGen/RISCV/alu64.ll              | 6 +-
 llvm/test/CodeGen/RISCV/branch-on-zero.ll     | 6 +-
 llvm/test/CodeGen/RISCV/condops.ll            | 18 +-
 llvm/test/CodeGen/RISCV/double-fcmp-strict.ll | 36 +-
 llvm/test/CodeGen/RISCV/float-fcmp-strict.ll  | 18 +-
 llvm/test/CodeGen/RISCV/half-fcmp-strict.ll   | 18 +-
 llvm/test/CodeGen/RISCV/llvm.frexp.ll         | 30 +-
 llvm/test/CodeGen/RISCV/machine-cp.mir        | 9 +-
 llvm/test/CodeGen/RISCV/neg-abs.ll            | 10 +-
 llvm/test/CodeGen/RISCV/nontemporal.ll        | 125 +-
 .../test/CodeGen/RISCV/overflow-intrinsics.ll | 13 +-
 llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll       | 8 +-
 .../CodeGen/RISCV/rv64-legal-i32/xaluo.ll     | 2603 +++++++++++++++++
 .../RISCV/rv64-statepoint-call-lowering.ll    | 3 +-
 .../RISCV/rvv/constant-folding-crash.ll       | 6 +-
 .../rvv/fixed-vectors-deinterleave-load.ll    | 2 +-
 .../RISCV/rvv/fixed-vectors-fmaximum-vp.ll    | 42 +-
 .../RISCV/rvv/fixed-vectors-fmaximum.ll       | 52 +-
 .../RISCV/rvv/fixed-vectors-fminimum-vp.ll    | 42 +-
 .../RISCV/rvv/fixed-vectors-fminimum.ll       | 52 +-
 .../rvv/fixed-vectors-masked-store-fp.ll      | 8 +-
 .../rvv/fixed-vectors-masked-store-int.ll     | 8 +-
 .../RISCV/rvv/fixed-vectors-reduction-fp.ll   | 24 +-
 .../rvv/fixed-vectors-reduction-int-vp.ll     | 24 +-
 .../test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll | 58 +-
 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll    | 52 +-
 .../test/CodeGen/RISCV/rvv/fminimum-sdnode.ll | 58 +-
 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll    | 52 +-
 .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll     | 9 +-
 .../test/CodeGen/RISCV/rvv/mask-reg-alloc.mir | 2 +-
 .../CodeGen/RISCV/rvv/no-reserved-frame.ll    | 7 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    | 2 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  | 2 +-
 llvm/test/CodeGen/RISCV/rvv/vmfeq.ll          | 18 +-
 llvm/test/CodeGen/RISCV/rvv/vmfge.ll          | 18 +-
 llvm/test/CodeGen/RISCV/rvv/vmfgt.ll          | 18 +-
 llvm/test/CodeGen/RISCV/rvv/vmfle.ll          | 18 +-
 llvm/test/CodeGen/RISCV/rvv/vmflt.ll          | 18 +-
 llvm/test/CodeGen/RISCV/rvv/vmfne.ll          | 18 +-
 llvm/test/CodeGen/RISCV/rvv/vmseq.ll          | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsge.ll          | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgeu.ll         | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgt.ll          | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsgtu.ll         | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsle.ll          | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsleu.ll         | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vmslt.ll          | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsltu.ll         | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vmsne.ll          | 30 +-
 llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll     | 2 +-
 .../CodeGen/RISCV/rvv/vsetvli-regression.ll   | 3 +-
 llvm/test/CodeGen/RISCV/rvv/vxrm.mir          | 2 +-
 llvm/test/CodeGen/RISCV/shifts.ll             | 6 +-
 llvm/test/CodeGen/RISCV/srem-vector-lkk.ll    | 31 +-
 llvm/test/CodeGen/RISCV/tail-calls.ll         | 14 +-
 .../CodeGen/RISCV/unaligned-load-store.ll     | 3 +-
 llvm/test/CodeGen/RISCV/urem-vector-lkk.ll    | 37 +-
 llvm/test/CodeGen/RISCV/wide-mem.ll           | 3 +-
 ...lar-shift-by-byte-multiple-legalization.ll | 6 +-
 .../RISCV/wide-scalar-shift-legalization.ll   | 6 +-
 llvm/test/CodeGen/RISCV/xaluo.ll              | 81 +-
 llvm/test/CodeGen/RISCV/xtheadmemidx.ll       | 2 +-
 llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir   | 16 +-
 llvm/test/CodeGen/Thumb/smul_fix_sat.ll       | 6 +-
 .../Thumb/umulo-128-legalisation-lowering.ll  | 4 +-
 llvm/test/CodeGen/Thumb2/mve-div-expand.ll    | 17 +-
 llvm/test/CodeGen/Thumb2/mve-fmath.ll         | 95 +-
 .../CodeGen/Thumb2/mve-fpclamptosat_vec.ll    | 32 +-
 .../CodeGen/Thumb2/mve-fptosi-sat-vector.ll   | 40 +-
 .../CodeGen/Thumb2/mve-fptoui-sat-vector.ll   | 42 +-
 llvm/test/CodeGen/Thumb2/mve-frint.ll         | 24 +-
 .../CodeGen/Thumb2/mve-laneinterleaving.ll    | 6 +-
 .../CodeGen/Thumb2/mve-sext-masked-load.ll    | 9 +-
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll       | 24 +-
 llvm/test/CodeGen/Thumb2/mve-shufflemov.ll    | 50 +-
 llvm/test/CodeGen/Thumb2/mve-simple-arith.ll  | 18 +-
 llvm/test/CodeGen/Thumb2/mve-vabdus.ll        | 6 +-
 llvm/test/CodeGen/Thumb2/mve-vcvt.ll          | 8 +-
 llvm/test/CodeGen/Thumb2/mve-vcvt16.ll        | 4 +-
 llvm/test/CodeGen/Thumb2/mve-vld4.ll          | 4 +-
 llvm/test/CodeGen/Thumb2/mve-vmovn.ll         | 4 +-
 llvm/test/CodeGen/Thumb2/mve-vst4.ll          | 4 +-
 .../CodeGen/Thumb2/mve-zext-masked-load.ll    | 10 +-
 llvm/test/CodeGen/X86/apx/mul-i1024.ll        | 19 +-
 llvm/test/CodeGen/X86/atomic-unordered.ll     | 2 +-
 .../CodeGen/X86/avx10_2_512ni-intrinsics.ll   | 6 +-
 llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll | 12 +-
 llvm/test/CodeGen/X86/avx512-calling-conv.ll  | 38 +-
 .../CodeGen/X86/avx512-gfni-intrinsics.ll     | 60 +-
 .../test/CodeGen/X86/avx512-insert-extract.ll | 14 +-
 llvm/test/CodeGen/X86/avx512-intrinsics.ll    | 6 +-
 llvm/test/CodeGen/X86/avx512-mask-op.ll       | 10 +-
 .../X86/avx512bw-intrinsics-upgrade.ll        | 20 +-
 llvm/test/CodeGen/X86/avx512bw-intrinsics.ll  | 6 +-
 .../X86/avx512bwvl-intrinsics-upgrade.ll      | 12 +-
 .../test/CodeGen/X86/avx512bwvl-intrinsics.ll | 30 +-
 .../X86/avx512vbmi2vl-intrinsics-upgrade.ll   | 84 +-
 .../CodeGen/X86/avx512vbmi2vl-intrinsics.ll   | 12 +-
 .../X86/avx512vl-intrinsics-upgrade.ll        | 12 +-
 .../X86/div-rem-pair-recomposition-signed.ll  | 2 +-
 .../div-rem-pair-recomposition-unsigned.ll    | 2 +-
 .../element-wise-atomic-memory-intrinsics.ll  | 6 +-
 .../CodeGen/X86/expand-vp-cast-intrinsics.ll  | 3 +-
 llvm/test/CodeGen/X86/extract-bits.ll         | 33 +-
 llvm/test/CodeGen/X86/icmp-abs-C-vec.ll       | 6 +-
 llvm/test/CodeGen/X86/is_fpclass.ll           | 6 +-
 llvm/test/CodeGen/X86/ldexp.ll                | 3 +-
 llvm/test/CodeGen/X86/legalize-shl-vec.ll     | 16 +-
 llvm/test/CodeGen/X86/matrix-multiply.ll      | 60 +-
 llvm/test/CodeGen/X86/mul-i1024.ll            | 2 +-
 llvm/test/CodeGen/X86/mul-i256.ll             | 2 +-
 llvm/test/CodeGen/X86/mul-i512.ll             | 2 +-
 .../X86/peephole-na-phys-copy-folding.ll      | 2 +-
 llvm/test/CodeGen/X86/pmul.ll                 | 3 +-
 llvm/test/CodeGen/X86/pmulh.ll                | 12 +-
 llvm/test/CodeGen/X86/pointer-vector.ll       | 3 +-
 llvm/test/CodeGen/X86/pr11334.ll              | 3 +-
 llvm/test/CodeGen/X86/pr34177.ll              | 2 +-
 llvm/test/CodeGen/X86/pr61964.ll              | 10 +-
 llvm/test/CodeGen/X86/shift-i128.ll           | 8 +-
 llvm/test/CodeGen/X86/sibcall.ll              | 3 +-
 llvm/test/CodeGen/X86/smul_fix.ll             | 2 +-
 llvm/test/CodeGen/X86/smul_fix_sat.ll         | 3 +-
 .../X86/smulo-128-legalisation-lowering.ll    | 12 +-
 .../subvectorwise-store-of-vector-splat.ll    | 30 +-
 llvm/test/CodeGen/X86/umul-with-overflow.ll   | 7 +-
 llvm/test/CodeGen/X86/umul_fix.ll             | 2 +-
 llvm/test/CodeGen/X86/umul_fix_sat.ll         | 6 +-
 .../X86/umulo-128-legalisation-lowering.ll    | 3 +-
 llvm/test/CodeGen/X86/vec_int_to_fp.ll        | 15 +-
 llvm/test/CodeGen/X86/vec_saddo.ll            | 14 +-
 llvm/test/CodeGen/X86/vec_ssubo.ll            | 5 +-
 llvm/test/CodeGen/X86/vec_umulo.ll            | 29 +-
 llvm/test/CodeGen/X86/vector-interleave.ll    | 6 +-
 .../vector-interleaved-load-i16-stride-2.ll   | 4 +-
 .../vector-interleaved-load-i16-stride-3.ll   | 16 +-
 .../vector-interleaved-load-i16-stride-4.ll   | 4 +-
 .../vector-interleaved-load-i16-stride-5.ll   | 26 +-
 .../vector-interleaved-load-i16-stride-6.ll   | 20 +-
 .../vector-interleaved-load-i16-stride-7.ll   | 49 +-
 .../vector-interleaved-load-i16-stride-8.ll   | 64 +-
 .../vector-interleaved-load-i32-stride-3.ll   | 44 +-
 .../vector-interleaved-load-i32-stride-4.ll   | 40 +-
 .../vector-interleaved-load-i32-stride-5.ll   | 133 +-
 .../vector-interleaved-load-i32-stride-6.ll   | 51 +-
 .../vector-interleaved-load-i32-stride-7.ll   | 60 +-
 .../vector-interleaved-load-i32-stride-8.ll   | 170 +-
 .../vector-interleaved-load-i64-stride-4.ll   | 12 +-
 .../vector-interleaved-load-i64-stride-5.ll   | 40 +-
 .../vector-interleaved-load-i64-stride-6.ll   | 16 +-
 .../vector-interleaved-load-i64-stride-7.ll   | 164 +-
 .../vector-interleaved-load-i64-stride-8.ll   | 464 +--
 .../vector-interleaved-load-i8-stride-3.ll    | 6 +-
 .../vector-interleaved-load-i8-stride-4.ll    | 12 +-
 .../vector-interleaved-load-i8-stride-5.ll    | 10 +-
 .../vector-interleaved-load-i8-stride-6.ll    | 22 +-
 .../vector-interleaved-load-i8-stride-7.ll    | 82 +-
 .../vector-interleaved-load-i8-stride-8.ll    | 98 +-
 .../vector-interleaved-store-i16-stride-3.ll  | 64 +-
 .../vector-interleaved-store-i16-stride-4.ll  | 24 +-
 .../vector-interleaved-store-i16-stride-5.ll  | 30 +-
 .../vector-interleaved-store-i16-stride-6.ll  | 48 +-
 .../vector-interleaved-store-i16-stride-7.ll  | 81 +-
 .../vector-interleaved-store-i32-stride-2.ll  | 24 +-
 .../vector-interleaved-store-i32-stride-3.ll  | 4 +-
 .../vector-interleaved-store-i32-stride-5.ll  | 22 +-
 .../vector-interleaved-store-i32-stride-6.ll  | 116 +-
 .../vector-interleaved-store-i32-stride-7.ll  | 115 +-
 .../vector-interleaved-store-i32-stride-8.ll  | 4 +-
 .../vector-interleaved-store-i64-stride-3.ll  | 8 +-
 .../vector-interleaved-store-i64-stride-4.ll  | 48 +-
 .../vector-interleaved-store-i64-stride-5.ll  | 10 +-
 .../vector-interleaved-store-i64-stride-7.ll  | 282 +-
 .../vector-interleaved-store-i64-stride-8.ll  | 96 +-
 .../vector-interleaved-store-i8-stride-3.ll   | 2 +-
 .../vector-interleaved-store-i8-stride-5.ll   | 8 +-
 .../vector-interleaved-store-i8-stride-6.ll   | 38 +-
 .../vector-interleaved-store-i8-stride-7.ll   | 37 +-
 .../vector-interleaved-store-i8-stride-8.ll   | 8 +-
 llvm/test/CodeGen/X86/vector-intrinsics.ll    | 4 +-
 llvm/test/CodeGen/X86/vector-sext.ll          | 15 +-
 .../X86/vector-shuffle-combining-avx.ll       | 21 +-
 llvm/test/CodeGen/X86/vector-zext.ll          | 6 +-
 .../X86/wide-scalar-shift-legalization.ll     | 2 +-
 .../CodeGen/X86/x86-interleaved-access.ll     | 10 +-
 llvm/test/CodeGen/X86/xmulo.ll                | 134 +-
 241 files changed, 6265 insertions(+), 4463 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir
 create mode 100644 llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index b34e0939d1c7c..493d7cd7d8c92 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -48,19 +48,27 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -70,9 +78,15 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
 #include <iterator>
+#include <queue>
+#include <utility>
+#include <variant>
 
 using namespace llvm;
 
@@ -92,6 +106,113 @@
 static cl::opt<bool>
     EnableSpillageCopyElimination("enable-spill-copy-elim", cl::Hidden);
 
 namespace {
+
+// A ScheduleDAG subclass that is used as a dependency graph.
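+//
+// It is only used to answer dependency queries when deciding whether the
+// instructions that block a backward copy propagation can be moved out of
+// the way. A sketch of how this pass drives it (see
+// BackwardCopyPropagateBlock below):
+//
+//   ScheduleDAGMCP DG(MF, /*MLI=*/nullptr);
+//   DG.startBlock(&MBB);
+//   DG.enterRegion(&MBB, MBB.begin(), MBB.end(), MBB.size());
+//   DG.buildSchedGraph(/*AA=*/nullptr);
+//   SUnit *SU = DG.getSUnit(&MI); // dependencies are then read off SU->Preds
+//
+// For example (illustrative pseudo-MIR, not taken from an actual test), in
+//
+//   $w8 = ADDWrs $w2, $w3, 0   ; def we would like to rewrite to define $w0
+//   STRWui $w0, $x1, 0         ; reads the old $w0: its anti-dependence on
+//                              ; the COPY below blocks the rewrite
+//   $w0 = COPY $w8
+//
+// the store depends on neither the ADD nor anything else in the section, so
+// it can be hoisted above the ADD, after which the COPY can be propagated
+// backward and deleted.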
+class ScheduleDAGMCP : public ScheduleDAGInstrs {
+public:
+  void schedule() override {
+    llvm_unreachable("This schedule DAG is only used as a dependency graph "
+                     "for Machine Copy Propagation");
+  }
+
+  ScheduleDAGMCP(MachineFunction &MF, const MachineLoopInfo *MLI,
+                 bool RemoveKillFlags = false)
+      : ScheduleDAGInstrs(MF, MLI, RemoveKillFlags) {
+    CanHandleTerminators = true;
+  }
+};
+
+static bool moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src,
+                                               ScheduleDAGMCP &DG) {
+  MachineInstr *DstInstr = Dst->getInstr();
+  MachineInstr *SrcInstr = Src->getInstr();
+
+  if (DstInstr == nullptr || SrcInstr == nullptr)
+    return false;
+
+  MachineBasicBlock *MBB = SrcInstr->getParent();
+  assert(DstInstr->getParent() == MBB &&
+         "This function only operates on a basic block level.");
+
+  int SectionSize =
+      std::distance(SrcInstr->getIterator(), DstInstr->getIterator());
+
+  // The bit vector representing the instructions in the section.
+  // This vector stores which instructions need to be moved and which do not.
+  BitVector SectionInstr(SectionSize, false);
+
+  // The queue for the breadth-first search.
+  std::queue<SUnit *> Edges;
+
+  // Process the children of a node.
+  // Every node is checked before it is put into the queue. A node is
+  // enqueued if it has no dependency on the source of the copy (the root of
+  // the traversal is a special case indicated by the IsRoot flag) and is
+  // located between the source of the copy and the destination of the copy.
+  auto ProcessSNodeChildren = [SrcInstr, &SectionSize, &SectionInstr](
+                                  std::queue<SUnit *> &Queue,
+                                  const SUnit *Node, bool IsRoot) -> bool {
+    for (const llvm::SDep &I : Node->Preds) {
+      SUnit *SU = I.getSUnit();
+      MachineInstr &MI = *(SU->getInstr());
+      if (!IsRoot && &MI == SrcInstr)
+        return false;
+
+      int DestinationFromSource =
+          std::distance(SrcInstr->getIterator(), MI.getIterator());
+
+      if (&MI != SrcInstr && DestinationFromSource > 0 &&
+          DestinationFromSource < SectionSize) {
+        // If an instruction is already marked to be moved, then it has
+        // already been processed together with all of its dependencies.
+        // We do not need to do anything with it again.
+        if (!SectionInstr[DestinationFromSource]) {
+          SectionInstr[DestinationFromSource] = true;
+          Queue.push(SU);
+        }
+      }
+    }
+    return true;
+  };
+
+  // The BFS happens here.
+  //
+  // We could not use the ADT implementation of BFS here: ADT graph
+  // traversals give us no way to select exactly which children of a node
+  // are put into the "nodes to traverse" queue or stack.
+  //
+  // We also could not work around this by filtering nodes at processing
+  // time, because it matters what the parent of an instruction is: the
+  // traversal is rooted at Dst, which necessarily has Src among its
+  // dependencies. For every other node, a dependency on Src means the end
+  // of the traversal, but at the root this dependency must be ignored. If
+  // we could not control which nodes are processed and we came across Src
+  // during a generic traversal, we would have to figure out after the fact
+  // which already-visited nodes depend on it, which would introduce extra
+  // cost.
+  ProcessSNodeChildren(Edges, Dst, true);
+  while (!Edges.empty()) {
+    const auto *Current = Edges.front();
+    Edges.pop();
+    if (!ProcessSNodeChildren(Edges, Current, false))
+      return false;
+  }
+
+  // If all of the dependencies were deemed movable during the BFS, then we
+  // move them before the copy source here, keeping their relative order to
+  // each other. Note that the iterator has to be advanced before the splice
+  // so that it keeps walking the original section.
+  auto CurrentInst = SrcInstr->getIterator();
+  for (int I = 0; I < SectionSize; I++) {
+    auto NextInst = std::next(CurrentInst);
+    if (SectionInstr[I])
+      MBB->splice(SrcInstr->getIterator(), MBB, CurrentInst);
+    CurrentInst = NextInst;
+  }
+  return true;
+}
 
 static std::optional<DestSourcePair> isCopyInstr(const MachineInstr &MI,
                                                  const TargetInstrInfo &TII,
@@ -114,6 +235,7 @@ class CopyTracker {
   };
 
   DenseMap<MCRegUnit, CopyInfo> Copies;
+  DenseMap<MCRegUnit, CopyInfo> InvalidCopies;
 
 public:
   /// Mark all of the given registers and their subregisters as unavailable for
@@ -130,9 +252,14 @@ class CopyTracker {
     }
   }
 
+  int getInvalidCopiesSize() { return InvalidCopies.size(); }
+
   /// Remove register from copy maps.
   void invalidateRegister(MCRegister Reg, const TargetRegisterInfo &TRI,
-                          const TargetInstrInfo &TII, bool UseCopyInstr) {
+                          const TargetInstrInfo &TII, bool UseCopyInstr,
+                          bool MayStillBePropagated = false) {
     // Since Reg might be a subreg of some registers, only invalidate Reg is not
     // enough. We have to find the COPY defines Reg or registers defined by Reg
     // and invalidate all of them. Similarly, we must invalidate all of the
@@ -158,8 +285,11 @@ class CopyTracker {
         InvalidateCopy(MI);
       }
     }
-    for (MCRegUnit Unit : RegUnitsToInvalidate)
+    for (MCRegUnit Unit : RegUnitsToInvalidate) {
+      if (Copies.contains(Unit) && MayStillBePropagated)
+        InvalidCopies[Unit] = Copies[Unit];
       Copies.erase(Unit);
+    }
   }
 
   /// Clobber a single register, removing it from the tracker's copy maps.
@@ -252,6 +382,10 @@ class CopyTracker {
     return !Copies.empty();
   }
 
+  bool hasAnyInvalidCopies() { return !InvalidCopies.empty(); }
+
   MachineInstr *findCopyForUnit(MCRegUnit RegUnit,
                                 const TargetRegisterInfo &TRI,
                                 bool MustBeAvailable = false) {
@@ -263,6 +397,17 @@ class CopyTracker {
     return CI->second.MI;
   }
 
+  MachineInstr *findInvalidCopyForUnit(MCRegUnit RegUnit,
+                                       const TargetRegisterInfo &TRI,
+                                       bool MustBeAvailable = false) {
+    auto CI = InvalidCopies.find(RegUnit);
+    if (CI == InvalidCopies.end())
+      return nullptr;
+    if (MustBeAvailable && !CI->second.Avail)
+      return nullptr;
+    return CI->second.MI;
+  }
+
   MachineInstr *findCopyDefViaUnit(MCRegUnit RegUnit,
                                    const TargetRegisterInfo &TRI) {
     auto CI = Copies.find(RegUnit);
@@ -274,12 +419,28 @@ class CopyTracker {
     return findCopyForUnit(RU, TRI, true);
   }
 
+  MachineInstr *findInvalidCopyDefViaUnit(MCRegUnit RegUnit,
+                                          const TargetRegisterInfo &TRI) {
+    auto CI = InvalidCopies.find(RegUnit);
+    if (CI == InvalidCopies.end())
+      return nullptr;
+    if (CI->second.DefRegs.size() != 1)
+      return nullptr;
+    MCRegUnit RU = *TRI.regunits(CI->second.DefRegs[0]).begin();
+    return findInvalidCopyForUnit(RU, TRI, false);
+  }
+
+  // TODO: This is ugly; there should be a more elegant solution for invalid
+  // copy searching. Create a variant that returns either a valid copy, an
+  // invalid copy, or no copy at all (std::monostate).
   MachineInstr *findAvailBackwardCopy(MachineInstr &I, MCRegister Reg,
                                       const TargetRegisterInfo &TRI,
                                       const TargetInstrInfo &TII,
-                                      bool UseCopyInstr) {
+                                      bool UseCopyInstr,
+                                      bool SearchInvalid = false) {
     MCRegUnit RU = *TRI.regunits(Reg).begin();
-    MachineInstr *AvailCopy = findCopyDefViaUnit(RU, TRI);
+    MachineInstr *AvailCopy = SearchInvalid ? findInvalidCopyDefViaUnit(RU, TRI)
+                                            : findCopyDefViaUnit(RU, TRI);
 
     if (!AvailCopy)
       return nullptr;
@@ -377,13 +538,20 @@ class CopyTracker {
 
   void clear() {
     Copies.clear();
+    InvalidCopies.clear();
   }
 };
 
+using Copy = MachineInstr *;
+using InvalidCopy = std::pair<MachineInstr *, bool>;
+using CopyLookupResult = std::variant<std::monostate, Copy, InvalidCopy>;
+
 class MachineCopyPropagation : public MachineFunctionPass {
+  LiveIntervals *LIS = nullptr;
   const TargetRegisterInfo *TRI = nullptr;
   const TargetInstrInfo *TII = nullptr;
   const MachineRegisterInfo *MRI = nullptr;
+  AAResults *AA = nullptr;
 
   // Return true if this is a copy instruction and false otherwise.
   bool UseCopyInstr;
 
@@ -398,6 +566,7 @@ class MachineCopyPropagation : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addUsedIfAvailable<LiveIntervalsWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -414,11 +583,11 @@ class MachineCopyPropagation : public MachineFunctionPass {
   void ReadRegister(MCRegister Reg, MachineInstr &Reader, DebugType DT);
   void readSuccessorLiveIns(const MachineBasicBlock &MBB);
   void ForwardCopyPropagateBlock(MachineBasicBlock &MBB);
-  void BackwardCopyPropagateBlock(MachineBasicBlock &MBB);
+  void BackwardCopyPropagateBlock(
+      MachineBasicBlock &MBB,
+      bool MoveDependenciesForBetterCopyPropagation = false);
   void EliminateSpillageCopies(MachineBasicBlock &MBB);
   bool eraseIfRedundant(MachineInstr &Copy, MCRegister Src, MCRegister Def);
   void forwardUses(MachineInstr &MI);
-  void propagateDefs(MachineInstr &MI);
+  void propagateDefs(MachineInstr &MI, ScheduleDAGMCP &DG,
+                     bool MoveDependenciesForBetterCopyPropagation = false);
   bool isForwardableRegClassCopy(const MachineInstr &Copy,
                                  const MachineInstr &UseI, unsigned UseIdx);
   bool isBackwardPropagatableRegClassCopy(const MachineInstr &Copy,
@@ -427,7 +596,7 @@ class MachineCopyPropagation : public MachineFunctionPass {
   bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use);
   bool hasOverlappingMultipleDef(const MachineInstr &MI,
                                  const MachineOperand &MODef, Register Def);
-
+
   /// Candidates for deletion.
   SmallSetVector<MachineInstr *, 8> MaybeDeadCopies;
@@ -986,8 +1155,10 @@ static bool isBackwardPropagatableCopy(const DestSourcePair &CopyOperands,
   return CopyOperands.Source->isRenamable() && CopyOperands.Source->isKill();
 }
 
-void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
-  if (!Tracker.hasAnyCopies())
+void MachineCopyPropagation::propagateDefs(
+    MachineInstr &MI, ScheduleDAGMCP &DG,
+    bool MoveDependenciesForBetterCopyPropagation) {
+  if (!Tracker.hasAnyCopies() && !Tracker.hasAnyInvalidCopies())
     return;
 
   for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx != OpEnd;
@@ -1010,8 +1181,30 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
     MachineInstr *Copy = Tracker.findAvailBackwardCopy(
         MI, MODef.getReg().asMCReg(), *TRI, *TII, UseCopyInstr);
-    if (!Copy)
-      continue;
+    if (!Copy) {
+      if (!MoveDependenciesForBetterCopyPropagation)
+        continue;
+
+      LLVM_DEBUG(
+          dbgs()
+          << "MCP: Couldn't find any backward copy that has no dependency.\n");
+      Copy = Tracker.findAvailBackwardCopy(MI, MODef.getReg().asMCReg(), *TRI,
+                                           *TII, UseCopyInstr, true);
+      if (!Copy) {
+        LLVM_DEBUG(
+            dbgs()
+            << "MCP: Couldn't find any backward copy that has dependency.\n");
+        continue;
+      }
+      LLVM_DEBUG(
+          dbgs()
+          << "MCP: Found potential backward copy that has dependency.\n");
+      SUnit *DstSUnit = DG.getSUnit(Copy);
+      SUnit *SrcSUnit = DG.getSUnit(&MI);
+
+      if (!moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit, DG))
+        continue;
+    }
 
     std::optional<DestSourcePair> CopyOperands =
         isCopyInstr(*Copy, *TII, UseCopyInstr);
@@ -1033,23 +1226,35 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI) {
     LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI)
                       << "\n     with " << printReg(Def, TRI) << "\n     in "
                       << MI << "     from " << *Copy);
+    if (!MoveDependenciesForBetterCopyPropagation) {
+      MODef.setReg(Def);
+      MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
 
-    MODef.setReg(Def);
-    MODef.setIsRenamable(CopyOperands->Destination->isRenamable());
-
-    LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
-    MaybeDeadCopies.insert(Copy);
-    Changed = true;
-    ++NumCopyBackwardPropagated;
+      LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
+      MaybeDeadCopies.insert(Copy);
+      Changed = true;
+      ++NumCopyBackwardPropagated;
+    }
   }
 }
 
 void MachineCopyPropagation::BackwardCopyPropagateBlock(
-    MachineBasicBlock &MBB) {
+    MachineBasicBlock &MBB, bool MoveDependenciesForBetterCopyPropagation) {
+  ScheduleDAGMCP DG(*MBB.getParent(), nullptr, false);
+  if (MoveDependenciesForBetterCopyPropagation) {
+    DG.startBlock(&MBB);
+    DG.enterRegion(&MBB, MBB.begin(), MBB.end(), MBB.size());
+    DG.buildSchedGraph(nullptr);
+  }
+
   LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName()
                     << "\n");
 
   for (MachineInstr &MI : llvm::make_early_inc_range(llvm::reverse(MBB))) {
     // Ignore non-trivial COPYs.
     std::optional<DestSourcePair> CopyOperands =
         isCopyInstr(MI, *TII, UseCopyInstr);
@@ -1062,7 +1267,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
       // just let forward cp do COPY-to-COPY propagation.
       if (isBackwardPropagatableCopy(*CopyOperands, *MRI)) {
         Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII,
-                                   UseCopyInstr);
+                                   UseCopyInstr,
+                                   MoveDependenciesForBetterCopyPropagation);
         Tracker.invalidateRegister(DefReg.asMCReg(), *TRI, *TII,
                                    UseCopyInstr);
         Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
@@ -1077,10 +1282,10 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
         MCRegister Reg = MO.getReg().asMCReg();
         if (!Reg)
           continue;
-        Tracker.invalidateRegister(Reg, *TRI, *TII, UseCopyInstr);
+        Tracker.invalidateRegister(Reg, *TRI, *TII, UseCopyInstr, false);
       }
 
-    propagateDefs(MI);
+    propagateDefs(MI, DG, MoveDependenciesForBetterCopyPropagation);
     for (const MachineOperand &MO : MI.operands()) {
       if (!MO.isReg())
         continue;
@@ -1104,7 +1309,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
         }
       } else {
         Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII,
-                                   UseCopyInstr);
+                                   UseCopyInstr,
+                                   MoveDependenciesForBetterCopyPropagation);
       }
     }
   }
@@ -1122,6 +1327,15 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
     Copy->eraseFromParent();
     ++NumDeletes;
   }
+  if (MoveDependenciesForBetterCopyPropagation) {
+    DG.exitRegion();
+    DG.finishBlock();
+    // QUESTION: Does it make sense to keep the kill flags here?
+    // In the other parts of this pass we just throw out the kill flags.
+    DG.fixupKills(MBB);
+  }
+
   MaybeDeadCopies.clear();
   CopyDbgUsers.clear();
@@ -1472,11 +1686,29 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
   TRI = MF.getSubtarget().getRegisterInfo();
   TII = MF.getSubtarget().getInstrInfo();
   MRI = &MF.getRegInfo();
+  auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
+  LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr;
 
   for (MachineBasicBlock &MBB : MF) {
     if (isSpillageCopyElimEnabled)
       EliminateSpillageCopies(MBB);
+
+    // BackwardCopyPropagateBlock happens in two stages.
+    // First we move those unnecessary dependencies out of the way that may
+    // block copy propagations.
+    //
+    // The reason for this two-stage approach is that the ScheduleDAG cannot
+    // handle register renaming.
+    // QUESTION: I think these two stages could be merged together if I were
+    // to change the renaming mechanism.
+    //
+    // The renaming would not happen instantly. There would be a data
+    // structure that contained what register should be renamed to what.
+    // Then, after the backward propagation has concluded, the renaming
+    // would happen (see the sketch below).
+    BackwardCopyPropagateBlock(MBB, true);
+    // Then we do the actual copy propagation.
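+    // A hypothetical sketch of that deferred renaming (not implemented
+    // here), which would let the two walks below be merged into one:
+    //   DenseMap<MCRegister, MCRegister> PendingRenames; // old def -> new def
+    //   ... record renames while walking the block backwards ...
+    //   ... then rewrite the recorded operands once the walk is done ...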
BackwardCopyPropagateBlock(MBB); + ForwardCopyPropagateBlock(MBB); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index de3f323891a36..92575d701f428 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -6026,8 +6026,8 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov w19, w1 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov w1, w2 ; CHECK-OUTLINE-O1-NEXT: mov w0, w19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 @@ -6133,8 +6133,8 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov w19, w1 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov w1, w2 ; CHECK-OUTLINE-O1-NEXT: mov w0, w19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 @@ -6238,8 +6238,8 @@ define { i32, i1 } @cmpxchg_i32(ptr %ptr, i32 %desired, i32 %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov w19, w1 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov w1, w2 ; CHECK-OUTLINE-O1-NEXT: mov w0, w19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 @@ -6336,8 +6336,8 @@ define { i64, i1 } @cmpxchg_i64(ptr %ptr, i64 %desired, i64 %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov x19, x1 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: mov x0, x19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 @@ -6434,8 +6434,8 @@ define { ptr, i1 } @cmpxchg_ptr(ptr %ptr, ptr %desired, ptr %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov x19, x1 +; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: mov x0, x19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll index c6819ff39ed33..a1377d6f89d67 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-pcsections.ll @@ -33,8 +33,8 @@ define i32 @val_compare_and_swap(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: bb.4.cmpxchg.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %pair = cmpxchg ptr %p, i32 %cmp, i32 %new acquire acquire, !pcsections !0 %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -73,8 +73,8 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) { ; CHECK-NEXT: bb.4.cmpxchg.end: ; CHECK-NEXT: liveins: $x8 ; 
CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %new = load i32, ptr %pnew, !pcsections !0 %pair = cmpxchg ptr %p, i32 %cmp, i32 %new acquire acquire, !pcsections !0 %val = extractvalue { i32, i1 } %pair, 0 @@ -112,8 +112,8 @@ define i32 @val_compare_and_swap_rel(ptr %p, i32 %cmp, i32 %new) { ; CHECK-NEXT: bb.4.cmpxchg.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %pair = cmpxchg ptr %p, i32 %cmp, i32 %new acq_rel monotonic, !pcsections !0 %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -151,7 +151,7 @@ define i64 @val_compare_and_swap_64(ptr %p, i64 %cmp, i64 %new) { ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x0 = ORRXrs $xzr, killed $x8, 0 - ; CHECK-NEXT: RET undef $lr, implicit $x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $x0 %pair = cmpxchg ptr %p, i64 %cmp, i64 %new monotonic monotonic, !pcsections !0 %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val @@ -189,7 +189,7 @@ define i64 @val_compare_and_swap_64_monotonic_seqcst(ptr %p, i64 %cmp, i64 %new) ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x0 = ORRXrs $xzr, killed $x8, 0 - ; CHECK-NEXT: RET undef $lr, implicit $x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $x0 %pair = cmpxchg ptr %p, i64 %cmp, i64 %new monotonic seq_cst, !pcsections !0 %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val @@ -227,7 +227,7 @@ define i64 @val_compare_and_swap_64_release_acquire(ptr %p, i64 %cmp, i64 %new) ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x0 = ORRXrs $xzr, killed $x8, 0 - ; CHECK-NEXT: RET undef $lr, implicit $x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $x0 %pair = cmpxchg ptr %p, i64 %cmp, i64 %new release acquire, !pcsections !0 %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val @@ -252,8 +252,8 @@ define i32 @fetch_and_nand(ptr %p) { ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %val = atomicrmw nand ptr %p, i32 7 release, !pcsections !0 ret i32 %val } @@ -278,7 +278,7 @@ define i64 @fetch_and_nand_64(ptr %p) { ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x0 = ORRXrs $xzr, killed $x8, 0 - ; CHECK-NEXT: RET undef $lr, implicit $x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $x0 %val = atomicrmw nand ptr %p, i64 7 acq_rel, !pcsections !0 ret i64 %val } @@ -303,8 +303,8 @@ define i32 @fetch_and_or(ptr %p) { ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %val = atomicrmw or ptr %p, i32 5 seq_cst, !pcsections !0 ret i32 %val } @@ -328,7 +328,7 @@ define i64 @fetch_and_or_64(ptr %p) { ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x0 = ORRXrs $xzr, killed $x8, 0 - ; CHECK-NEXT: RET undef $lr, implicit 
$x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $x0 %val = atomicrmw or ptr %p, i64 7 monotonic, !pcsections !0 ret i64 %val } @@ -366,7 +366,7 @@ define i32 @atomic_load(ptr %p) { ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w0 = LDARW killed renamable $x0, pcsections !0 :: (load seq_cst (s32) from %ir.p) - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %r = load atomic i32, ptr %p seq_cst, align 4, !pcsections !0 ret i32 %r } @@ -384,7 +384,7 @@ define i8 @atomic_load_relaxed_8(ptr %p, i32 %off32) { ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w8, killed renamable $w10, 0, pcsections !0 ; CHECK-NEXT: renamable $w9 = LDRBBui killed renamable $x9, 0, pcsections !0 :: (load unordered (s8) from %ir.ptr_random) ; CHECK-NEXT: renamable $w0 = ADDWrx killed renamable $w8, killed renamable $w9, 0, pcsections !0 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %ptr_unsigned = getelementptr i8, ptr %p, i32 4095 %val_unsigned = load atomic i8, ptr %ptr_unsigned monotonic, align 1, !pcsections !0 @@ -416,7 +416,7 @@ define i16 @atomic_load_relaxed_16(ptr %p, i32 %off32) { ; CHECK-NEXT: renamable $w8 = ADDWrx killed renamable $w8, killed renamable $w10, 8, pcsections !0 ; CHECK-NEXT: renamable $w9 = LDRHHui killed renamable $x9, 0, pcsections !0 :: (load unordered (s16) from %ir.ptr_random) ; CHECK-NEXT: renamable $w0 = ADDWrx killed renamable $w8, killed renamable $w9, 8, pcsections !0 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %ptr_unsigned = getelementptr i16, ptr %p, i32 4095 %val_unsigned = load atomic i16, ptr %ptr_unsigned monotonic, align 2, !pcsections !0 @@ -448,7 +448,7 @@ define i32 @atomic_load_relaxed_32(ptr %p, i32 %off32) { ; CHECK-NEXT: renamable $w9 = LDRWui killed renamable $x11, 0, pcsections !0 :: (load unordered (s32) from %ir.ptr_random) ; CHECK-NEXT: $w8 = ADDWrs killed renamable $w8, killed renamable $w10, 0, pcsections !0 ; CHECK-NEXT: $w0 = ADDWrs killed renamable $w8, killed renamable $w9, 0, pcsections !0 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %ptr_unsigned = getelementptr i32, ptr %p, i32 4095 %val_unsigned = load atomic i32, ptr %ptr_unsigned monotonic, align 4, !pcsections !0 @@ -480,7 +480,7 @@ define i64 @atomic_load_relaxed_64(ptr %p, i32 %off32) { ; CHECK-NEXT: renamable $x9 = LDRXui killed renamable $x11, 0, pcsections !0 :: (load unordered (s64) from %ir.ptr_random) ; CHECK-NEXT: $x8 = ADDXrs killed renamable $x8, killed renamable $x10, 0, pcsections !0 ; CHECK-NEXT: $x0 = ADDXrs killed renamable $x8, killed renamable $x9, 0, pcsections !0 - ; CHECK-NEXT: RET undef $lr, implicit $x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $x0 %ptr_unsigned = getelementptr i64, ptr %p, i32 4095 %val_unsigned = load atomic i64, ptr %ptr_unsigned monotonic, align 8, !pcsections !0 @@ -624,7 +624,7 @@ define i32 @load_zext(ptr %p8, ptr %p16) { ; CHECK-NOLSE-NEXT: renamable $w8 = LDARB killed renamable $x0, pcsections !0 :: (load acquire (s8) from %ir.p8) ; CHECK-NOLSE-NEXT: renamable $w9 = LDRHHui killed renamable $x1, 0, pcsections !0 :: (load unordered (s16) from %ir.p16) ; CHECK-NOLSE-NEXT: renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 0, pcsections !0 - ; CHECK-NOLSE-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NOLSE-NEXT: RET undef $lr, implicit killed $w0 ; ; CHECK-LDAPR-LABEL: name: load_zext ; CHECK-LDAPR: bb.0 
(%ir-block.0): @@ -633,7 +633,7 @@ define i32 @load_zext(ptr %p8, ptr %p16) { ; CHECK-LDAPR-NEXT: renamable $w8 = LDAPRB killed renamable $x0, pcsections !0 :: (load acquire (s8) from %ir.p8) ; CHECK-LDAPR-NEXT: renamable $w9 = LDRHHui killed renamable $x1, 0, pcsections !0 :: (load unordered (s16) from %ir.p16) ; CHECK-LDAPR-NEXT: renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 0, pcsections !0 - ; CHECK-LDAPR-NEXT: RET undef $lr, implicit $w0 + ; CHECK-LDAPR-NEXT: RET undef $lr, implicit killed $w0 %val1.8 = load atomic i8, ptr %p8 acquire, align 1, !pcsections !0 %val1 = zext i8 %val1.8 to i32 @@ -651,7 +651,7 @@ define { i32, i64 } @load_acq(ptr %p32, ptr %p64) { ; CHECK-NOLSE-NEXT: {{ $}} ; CHECK-NOLSE-NEXT: renamable $w0 = LDARW killed renamable $x0, pcsections !0 :: (load seq_cst (s32) from %ir.p32) ; CHECK-NOLSE-NEXT: renamable $x1 = LDARX killed renamable $x1, pcsections !0 :: (load acquire (s64) from %ir.p64) - ; CHECK-NOLSE-NEXT: RET undef $lr, implicit $w0, implicit $x1 + ; CHECK-NOLSE-NEXT: RET undef $lr, implicit killed $w0, implicit killed $x1 ; ; CHECK-LDAPR-LABEL: name: load_acq ; CHECK-LDAPR: bb.0 (%ir-block.0): @@ -659,7 +659,7 @@ define { i32, i64 } @load_acq(ptr %p32, ptr %p64) { ; CHECK-LDAPR-NEXT: {{ $}} ; CHECK-LDAPR-NEXT: renamable $w0 = LDARW killed renamable $x0, pcsections !0 :: (load seq_cst (s32) from %ir.p32) ; CHECK-LDAPR-NEXT: renamable $x1 = LDAPRX killed renamable $x1, pcsections !0 :: (load acquire (s64) from %ir.p64) - ; CHECK-LDAPR-NEXT: RET undef $lr, implicit $w0, implicit $x1 + ; CHECK-LDAPR-NEXT: RET undef $lr, implicit killed $w0, implicit killed $x1 %val32 = load atomic i32, ptr %p32 seq_cst, align 4, !pcsections !0 %tmp = insertvalue { i32, i64 } undef, i32 %val32, 0 @@ -678,7 +678,7 @@ define i32 @load_sext(ptr %p8, ptr %p16) { ; CHECK-NOLSE-NEXT: renamable $w9 = LDRHHui killed renamable $x1, 0, pcsections !0 :: (load unordered (s16) from %ir.p16) ; CHECK-NOLSE-NEXT: renamable $w9 = SBFMWri killed renamable $w9, 0, 15 ; CHECK-NOLSE-NEXT: renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 32, pcsections !0 - ; CHECK-NOLSE-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NOLSE-NEXT: RET undef $lr, implicit killed $w0 ; ; CHECK-LDAPR-LABEL: name: load_sext ; CHECK-LDAPR: bb.0 (%ir-block.0): @@ -688,7 +688,7 @@ define i32 @load_sext(ptr %p8, ptr %p16) { ; CHECK-LDAPR-NEXT: renamable $w9 = LDRHHui killed renamable $x1, 0, pcsections !0 :: (load unordered (s16) from %ir.p16) ; CHECK-LDAPR-NEXT: renamable $w9 = SBFMWri killed renamable $w9, 0, 15 ; CHECK-LDAPR-NEXT: renamable $w0 = ADDWrx killed renamable $w9, killed renamable $w8, 32, pcsections !0 - ; CHECK-LDAPR-NEXT: RET undef $lr, implicit $w0 + ; CHECK-LDAPR-NEXT: RET undef $lr, implicit killed $w0 %val1.8 = load atomic i8, ptr %p8 acquire, align 1, !pcsections !0 %val1 = sext i8 %val1.8 to i32 @@ -728,14 +728,14 @@ define i8 @atomicrmw_add_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; 
CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw add ptr %ptr, i8 %rhs seq_cst, !pcsections !0 ret i8 %res } @@ -746,7 +746,7 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w1 = KILL $w1, implicit-def $x1 + ; CHECK-NEXT: renamable $w1 = KILL killed $w1, implicit-def $x1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) @@ -759,8 +759,8 @@ define i8 @atomicrmw_xchg_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw xchg ptr %ptr, i8 %rhs monotonic, !pcsections !0 ret i8 %res } @@ -777,14 +777,14 @@ define i8 @atomicrmw_sub_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw sub ptr %ptr, i8 %rhs acquire, !pcsections !0 ret i8 %res } @@ -801,14 +801,14 @@ define i8 @atomicrmw_and_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw and ptr %ptr, i8 %rhs release, !pcsections !0 ret i8 %res } @@ -825,14 +825,14 @@ define i8 @atomicrmw_or_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = 
LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw or ptr %ptr, i8 %rhs seq_cst, !pcsections !0 ret i8 %res } @@ -849,14 +849,14 @@ define i8 @atomicrmw_xor_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw xor ptr %ptr, i8 %rhs monotonic, !pcsections !0 ret i8 %res } @@ -875,14 +875,14 @@ define i8 @atomicrmw_min_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 11, implicit killed $nzcv, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw min ptr %ptr, i8 %rhs acquire, !pcsections !0 ret i8 %res } @@ -901,14 +901,14 @@ define i8 @atomicrmw_max_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 7, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 32, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, 
implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STLXRB killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw max ptr %ptr, i8 %rhs release, !pcsections !0 ret i8 %res } @@ -926,17 +926,17 @@ define i8 @atomicrmw_umin_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8 + ; CHECK-NEXT: renamable $w8 = ANDWri killed renamable $w8, 7, implicit $x8 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w11 = STLXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w11 = STLXRB killed renamable $w10, renamable $x0, implicit $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $w8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw umin ptr %ptr, i8 %rhs seq_cst, !pcsections !0 ret i8 %res } @@ -954,17 +954,17 @@ define i8 @atomicrmw_umax_i8(ptr %ptr, i8 %rhs) { ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRB renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s8) from %ir.ptr) - ; CHECK-NEXT: renamable $w8 = ANDWri renamable $w8, 7, implicit killed $x8 + ; CHECK-NEXT: renamable $w8 = ANDWri killed renamable $w8, 7, implicit $x8 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w11 = STXRB renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w11 = STXRB killed renamable $w10, renamable $x0, implicit $x10, pcsections !0 :: (volatile store (s8) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $w8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw umax ptr %ptr, i8 %rhs monotonic, !pcsections !0 ret i8 %res } @@ -981,14 +981,14 @@ define i16 @atomicrmw_add_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: 
{{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = ADDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw add ptr %ptr, i16 %rhs seq_cst, !pcsections !0 ret i16 %res } @@ -999,7 +999,7 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $w1, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $w1 = KILL $w1, implicit-def $x1 + ; CHECK-NEXT: renamable $w1 = KILL killed $w1, implicit-def $x1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.atomicrmw.start: ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) @@ -1012,8 +1012,8 @@ define i16 @atomicrmw_xchg_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw xchg ptr %ptr, i16 %rhs monotonic, !pcsections !0 ret i16 %res } @@ -1030,14 +1030,14 @@ define i16 @atomicrmw_sub_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = SUBWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw sub ptr %ptr, i16 %rhs acquire, !pcsections !0 ret i16 %res } @@ -1054,14 +1054,14 @@ define i16 @atomicrmw_and_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = ANDWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections 
!0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw and ptr %ptr, i16 %rhs release, !pcsections !0 ret i16 %res } @@ -1078,14 +1078,14 @@ define i16 @atomicrmw_or_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = ORRWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw or ptr %ptr, i16 %rhs seq_cst, !pcsections !0 ret i16 %res } @@ -1102,14 +1102,14 @@ define i16 @atomicrmw_xor_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) ; CHECK-NEXT: $w9 = EORWrs renamable $w8, renamable $w1, 0, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw xor ptr %ptr, i16 %rhs monotonic, !pcsections !0 ret i16 %res } @@ -1128,14 +1128,14 @@ define i16 @atomicrmw_min_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 11, implicit killed $nzcv, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: 
{{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw min ptr %ptr, i16 %rhs acquire, !pcsections !0 ret i16 %res } @@ -1154,14 +1154,14 @@ define i16 @atomicrmw_max_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: renamable $w9 = SBFMWri renamable $w8, 0, 15, pcsections !0 ; CHECK-NEXT: dead $wzr = SUBSWrx killed renamable $w9, renamable $w1, 40, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w9 = CSELWr renamable $w8, renamable $w1, 12, implicit killed $nzcv, implicit-def $x9, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH renamable $w9, renamable $x0, implicit killed $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w10 = STLXRH killed renamable $w9, renamable $x0, implicit $x9, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w10, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w0 = ORRWrs $wzr, $w8, 0, implicit killed $x8 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0, implicit $x8 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw max ptr %ptr, i16 %rhs release, !pcsections !0 ret i16 %res } @@ -1179,17 +1179,17 @@ define i16 @atomicrmw_umin_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDAXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8 + ; CHECK-NEXT: renamable $w8 = ANDWri killed renamable $w8, 15, implicit $x8 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 3, implicit killed $nzcv, implicit-def $x10, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w11 = STLXRH renamable $w10, renamable $x0, implicit killed $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w11 = STLXRH killed renamable $w10, renamable $x0, implicit $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $w8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw umin ptr %ptr, i16 %rhs seq_cst, !pcsections !0 ret i16 %res } @@ -1207,17 +1207,17 @@ define i16 @atomicrmw_umax_i16(ptr %ptr, i16 %rhs) { ; CHECK-NEXT: liveins: $w9, $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w8 = LDXRH renamable $x0, implicit-def $x8, pcsections !0 :: (volatile load (s16) from %ir.ptr) - ; CHECK-NEXT: renamable $w8 = ANDWri renamable $w8, 15, implicit killed $x8 + ; CHECK-NEXT: renamable $w8 = ANDWri killed renamable $w8, 15, implicit $x8 ; CHECK-NEXT: $wzr = SUBSWrs renamable $w8, renamable $w9, 0, implicit-def $nzcv, pcsections !0 ; CHECK-NEXT: renamable $w10 = CSELWr renamable $w8, renamable $w9, 8, implicit killed $nzcv, implicit-def $x10, pcsections !0 - ; CHECK-NEXT: early-clobber renamable $w11 = STXRH renamable $w10, renamable $x0, implicit killed $x10, 
pcsections !0 :: (volatile store (s16) into %ir.ptr) + ; CHECK-NEXT: early-clobber renamable $w11 = STXRH killed renamable $w10, renamable $x0, implicit $x10, pcsections !0 :: (volatile store (s16) into %ir.ptr) ; CHECK-NEXT: CBNZW killed renamable $w11, %bb.1, pcsections !0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.atomicrmw.end: ; CHECK-NEXT: liveins: $w8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $w0 = ORRWrs $wzr, killed $w8, 0 - ; CHECK-NEXT: RET undef $lr, implicit $w0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0 %res = atomicrmw umax ptr %ptr, i16 %rhs monotonic, !pcsections !0 ret i16 %res } @@ -1228,8 +1228,8 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x8 = ORRXrs $xzr, $x0, 0 - ; CHECK-NEXT: renamable $w2 = KILL $w2, implicit-def $x2 + ; CHECK-NEXT: $x8 = ORRXrs $xzr, killed $x0, 0 + ; CHECK-NEXT: renamable $w2 = KILL killed $w2, implicit-def $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) @@ -1251,16 +1251,16 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w1 = MOVZWi 1, 0 - ; CHECK-NEXT: $w0 = KILL renamable $w0, implicit killed $x0 - ; CHECK-NEXT: RET undef $lr, implicit $w0, implicit $w1 + ; CHECK-NEXT: $w0 = KILL killed renamable $w0, implicit $x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0, implicit killed $w1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.cmpxchg.nostore: ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $w1 = ORRWrs $wzr, $wzr, 0 ; CHECK-NEXT: CLREX 15, pcsections !0 - ; CHECK-NEXT: $w0 = KILL renamable $w0, implicit killed $x0 - ; CHECK-NEXT: RET undef $lr, implicit $w0, implicit $w1 + ; CHECK-NEXT: $w0 = KILL killed renamable $w0, implicit $x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0, implicit killed $w1 %res = cmpxchg ptr %ptr, i8 %desired, i8 %new monotonic monotonic, !pcsections !0 ret { i8, i1 } %res } @@ -1271,8 +1271,8 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $w1, $w2, $x0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x8 = ORRXrs $xzr, $x0, 0 - ; CHECK-NEXT: renamable $w2 = KILL $w2, implicit-def $x2 + ; CHECK-NEXT: $x8 = ORRXrs $xzr, killed $x0, 0 + ; CHECK-NEXT: renamable $w2 = KILL killed $w2, implicit-def $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.cmpxchg.start: ; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000) @@ -1294,16 +1294,16 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $w1 = MOVZWi 1, 0 - ; CHECK-NEXT: $w0 = KILL renamable $w0, implicit killed $x0 - ; CHECK-NEXT: RET undef $lr, implicit $w0, implicit $w1 + ; CHECK-NEXT: $w0 = KILL killed renamable $w0, implicit $x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0, implicit killed $w1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4.cmpxchg.nostore: ; CHECK-NEXT: liveins: $x0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $w1 = ORRWrs $wzr, $wzr, 0 ; CHECK-NEXT: CLREX 15, pcsections !0 - ; CHECK-NEXT: $w0 = KILL renamable $w0, implicit killed $x0 - ; CHECK-NEXT: RET undef $lr, implicit $w0, implicit $w1 + ; CHECK-NEXT: $w0 = KILL killed renamable $w0, implicit $x0 + ; CHECK-NEXT: RET undef $lr, implicit killed $w0, implicit killed $w1 %res = cmpxchg ptr %ptr, i16 %desired, i16 %new 
monotonic monotonic, !pcsections !0 ret { i16, i1 } %res } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll index 7fd71b26fa1ba..c8f8361e5ef88 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/merge-stores-truncating.ll @@ -256,9 +256,8 @@ define dso_local i32 @load_between_stores(i32 %x, ptr %p, ptr %ptr) { ; CHECK: ; %bb.0: ; CHECK-NEXT: strh w0, [x1] ; CHECK-NEXT: lsr w9, w0, #16 -; CHECK-NEXT: ldr w8, [x2] +; CHECK-NEXT: ldr w0, [x2] ; CHECK-NEXT: strh w9, [x1, #2] -; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %t1 = trunc i32 %x to i16 %sh = lshr i32 %x, 16 diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll index e11ae9a251590..63cfa19ab3d49 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll @@ -510,10 +510,9 @@ define i128 @mulv_v2i128(<2 x i128> %a) { ; CHECK-GI-LABEL: mulv_v2i128: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mul x9, x0, x3 -; CHECK-GI-NEXT: mul x8, x0, x2 ; CHECK-GI-NEXT: umulh x10, x0, x2 ; CHECK-GI-NEXT: madd x9, x1, x2, x9 -; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mul x0, x0, x2 ; CHECK-GI-NEXT: add x1, x9, x10 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll index 410c2d9021d6d..a150a0f6ee40a 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-wide-mul.ll @@ -131,13 +131,12 @@ entry: define <16 x i32> @mla_i32(<16 x i8> %a, <16 x i8> %b, <16 x i32> %c) { ; CHECK-SD-LABEL: mla_i32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umull2 v7.8h, v0.16b, v1.16b ; CHECK-SD-NEXT: umull v6.8h, v0.8b, v1.8b -; CHECK-SD-NEXT: uaddw2 v5.4s, v5.4s, v7.8h +; CHECK-SD-NEXT: umull2 v7.8h, v0.16b, v1.16b ; CHECK-SD-NEXT: uaddw v0.4s, v2.4s, v6.4h ; CHECK-SD-NEXT: uaddw2 v1.4s, v3.4s, v6.8h +; CHECK-SD-NEXT: uaddw2 v3.4s, v5.4s, v7.8h ; CHECK-SD-NEXT: uaddw v2.4s, v4.4s, v7.4h -; CHECK-SD-NEXT: mov v3.16b, v5.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: mla_i32: @@ -170,18 +169,17 @@ define <16 x i64> @mla_i64(<16 x i8> %a, <16 x i8> %b, <16 x i64> %c) { ; CHECK-SD-NEXT: umull2 v0.8h, v0.16b, v1.16b ; CHECK-SD-NEXT: ldp q20, q21, [sp] ; CHECK-SD-NEXT: ushll v17.4s, v16.4h, #0 +; CHECK-SD-NEXT: ushll v18.4s, v0.4h, #0 ; CHECK-SD-NEXT: ushll2 v16.4s, v16.8h, #0 ; CHECK-SD-NEXT: ushll2 v19.4s, v0.8h, #0 -; CHECK-SD-NEXT: ushll v18.4s, v0.4h, #0 ; CHECK-SD-NEXT: uaddw2 v1.2d, v3.2d, v17.4s ; CHECK-SD-NEXT: uaddw v0.2d, v2.2d, v17.2s ; CHECK-SD-NEXT: uaddw2 v3.2d, v5.2d, v16.4s ; CHECK-SD-NEXT: uaddw v2.2d, v4.2d, v16.2s -; CHECK-SD-NEXT: uaddw2 v16.2d, v21.2d, v19.4s ; CHECK-SD-NEXT: uaddw v4.2d, v6.2d, v18.2s ; CHECK-SD-NEXT: uaddw2 v5.2d, v7.2d, v18.4s +; CHECK-SD-NEXT: uaddw2 v7.2d, v21.2d, v19.4s ; CHECK-SD-NEXT: uaddw v6.2d, v20.2d, v19.2s -; CHECK-SD-NEXT: mov v7.16b, v16.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: mla_i64: diff --git a/llvm/test/CodeGen/AArch64/addp-shuffle.ll b/llvm/test/CodeGen/AArch64/addp-shuffle.ll index fb96d11acc275..8a9cca866ff05 100644 --- a/llvm/test/CodeGen/AArch64/addp-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/addp-shuffle.ll @@ -63,9 +63,8 @@ define <16 x i8> @deinterleave_shuffle_v32i8(<32 x i8> %a) { define <4 x i64> @deinterleave_shuffle_v8i64(<8 x i64> %a) { ; CHECK-LABEL: deinterleave_shuffle_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: addp 
v2.2d, v2.2d, v3.2d ; CHECK-NEXT: addp v0.2d, v0.2d, v1.2d -; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: addp v1.2d, v2.2d, v3.2d ; CHECK-NEXT: ret %r0 = shufflevector <8 x i64> %a, <8 x i64> poison, <4 x i32> %r1 = shufflevector <8 x i64> %a, <8 x i64> poison, <4 x i32> @@ -123,9 +122,8 @@ define <8 x half> @deinterleave_shuffle_v16f16(<16 x half> %a) { define <4 x double> @deinterleave_shuffle_v8f64(<8 x double> %a) { ; CHECK-LABEL: deinterleave_shuffle_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: faddp v2.2d, v2.2d, v3.2d ; CHECK-NEXT: faddp v0.2d, v0.2d, v1.2d -; CHECK-NEXT: mov v1.16b, v2.16b +; CHECK-NEXT: faddp v1.2d, v2.2d, v3.2d ; CHECK-NEXT: ret %r0 = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> %r1 = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir new file mode 100644 index 0000000000000..c3a59990ccd25 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir @@ -0,0 +1,201 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=aarch64 -run-pass machine-cp -verify-machineinstrs -o - %s | FileCheck %s +--- | + source_filename = "llvmirrepoW.ll" + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" + + declare dso_local i32 @chain(i32 noundef, i32 noundef) local_unnamed_addr + + declare dso_local void @init_var(ptr noundef) local_unnamed_addr + + define dso_local void @fun2(i64 %a, i64 %b) local_unnamed_addr { + entry: + %c = alloca i32, align 4 + ret void + } + define dso_local void @blocker(i64 %a, i64 %b) local_unnamed_addr { + entry: + %c = alloca i32, align 4 + ret void + } + +... 
+---
+name: fun2
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: true
+registers: []
+liveins:
+  - { reg: '$x0', virtual-reg: '' }
+  - { reg: '$x1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 4
+  adjustsStack: true
+  hasCalls: true
+  stackProtector: ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  hasTailCall: false
+  isCalleeSavedInfoValid: false
+  localFrameSize: 4
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack:
+  - { id: 0, name: c, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: fun2
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: $w0 = KILL killed renamable $w0, implicit $x0
+    ; CHECK-NEXT: $w1 = KILL killed renamable $w1, implicit $x1
+    ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def $w0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: renamable $w1 = COPY killed $w0
+    ; CHECK-NEXT: $w0 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def dead $w0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: RET_ReallyLR
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $w0 = KILL renamable $w0, implicit killed $x0
+    $w1 = KILL renamable $w1, implicit killed $x1
+    BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
+    renamable $w1 = COPY $w0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $w0 = COPY killed renamable $w8
+    BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    RET_ReallyLR
+
+...
+
+---
+name: blocker
+alignment: 4
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: true
+registers: []
+liveins:
+  - { reg: '$x0', virtual-reg: '' }
+  - { reg: '$x1', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 0
+  offsetAdjustment: 0
+  maxAlignment: 4
+  adjustsStack: true
+  hasCalls: true
+  stackProtector: ''
+  functionContext: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  hasTailCall: false
+  isCalleeSavedInfoValid: false
+  localFrameSize: 4
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack:
+  - { id: 0, name: c, type: default, offset: 0, size: 4, alignment: 4,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -4, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1
+
+    ; CHECK-LABEL: name: blocker
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: $w0 = KILL killed renamable $w0, implicit $x0
+    ; CHECK-NEXT: $w1 = KILL killed renamable $w1, implicit $x1
+    ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def $w0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
+    ; CHECK-NEXT: renamable $w1 = COPY $w0
+    ; CHECK-NEXT: $w0 = ADDWrr killed $w0, $w0
+    ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def dead $w0
+    ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    ; CHECK-NEXT: RET_ReallyLR
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $w0 = KILL renamable $w0, implicit killed $x0
+    $w1 = KILL renamable $w1, implicit killed $x1
+    BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c)
+    renamable $w8 = ADDWrr $w0, $w0
+    renamable $w1 = COPY $w0
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    $w0 = COPY killed renamable $w8
+    BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $w0, implicit $w1, implicit-def $sp, implicit-def dead $w0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll index c2ec0502d83bd..cd821675bae6e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll @@ -28,9 +28,8 @@ define i56 @ldi56(ptr %p) nounwind { define i80 @ldi80(ptr %p) nounwind { ; CHECK-LABEL: ldi80: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: ldrh w1, [x0, #8] -; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ldr x0, [x0] ; CHECK-NEXT: ret %r = load i80, ptr %p ret i80 %r @@ -55,8 +54,9 @@ define i280 @ldi280(ptr %p) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldrb w9, [x0, #34] ; CHECK-NEXT: ldrh w10, [x0, #32] -; CHECK-NEXT: ldp x8, x1, [x0] -; CHECK-NEXT: ldp x2, x3, [x0, #16] +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: ldp x1, x2, [x0, #8] +; CHECK-NEXT: ldr x3, [x0, #24] ; CHECK-NEXT: orr x4, x10, x9, lsl #16 ; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll index 1f5654d59926d..4f067ab1d48c1 100644 --- a/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll +++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll @@ -31,35 +31,21 @@ define <8 x i16> @func2(<8 x i8> %v0) nounwind { } define <16 x i16> @func3(<16 x i8> %v0) nounwind { -; CHECK-SD-LABEL: func3: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ushll2.8h v1, v0, #0 -; CHECK-SD-NEXT: ushll.8h v0, v0, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: func3: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ushll.8h v2, v0, #0 -; CHECK-GI-NEXT: ushll2.8h v1, v0, #0 -; CHECK-GI-NEXT: mov.16b v0, v2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: func3: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll2.8h v1, v0, #0 +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: ret %r = zext <16 x i8> %v0 to <16 x i16> ret <16 x i16> %r } define <16 x i16> @func4(<16 x i8> %v0) nounwind { -; CHECK-SD-LABEL: func4: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: sshll2.8h v1, v0, #0 -; CHECK-SD-NEXT: sshll.8h v0, v0, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: func4: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sshll.8h v2, v0, #0 -; CHECK-GI-NEXT: sshll2.8h v1, v0, #0 -; CHECK-GI-NEXT: mov.16b v0, v2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: func4: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll2.8h v1, v0, #0 +; CHECK-NEXT: sshll.8h v0, v0, #0 +; CHECK-NEXT: ret %r = sext <16 x i8> %v0 to <16 x i16> ret <16 x i16> %r } @@ -87,35 +73,21 @@ define <4 x i32> @afunc2(<4 x i16> %v0) nounwind { } define <8 x i32> @afunc3(<8 x i16> %v0) nounwind { -; CHECK-SD-LABEL: afunc3: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ushll2.4s v1, v0, #0 -; CHECK-SD-NEXT: ushll.4s v0, v0, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: afunc3: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ushll.4s v2, v0, #0 -; CHECK-GI-NEXT: ushll2.4s v1, v0, #0 -; CHECK-GI-NEXT: mov.16b v0, v2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: afunc3: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll2.4s v1, v0, #0 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: ret %r = zext <8 x i16> %v0 to <8 x i32> ret <8 x i32> %r } define <8 x i32> @afunc4(<8 x i16> %v0) nounwind { -; CHECK-SD-LABEL: afunc4: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: sshll2.4s v1, v0, #0 -; CHECK-SD-NEXT: sshll.4s v0, v0, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: afunc4: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sshll.4s v2, v0, #0 -; CHECK-GI-NEXT: sshll2.4s v1, v0, #0 -; CHECK-GI-NEXT: mov.16b v0, v2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: afunc4: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll2.4s 
v1, v0, #0 +; CHECK-NEXT: sshll.4s v0, v0, #0 +; CHECK-NEXT: ret %r = sext <8 x i16> %v0 to <8 x i32> ret <8 x i32> %r } @@ -161,35 +133,21 @@ define <8 x i32> @bfunc2(<8 x i8> %v0) nounwind { ;----- define <4 x i64> @zfunc1(<4 x i32> %v0) nounwind { -; CHECK-SD-LABEL: zfunc1: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ushll2.2d v1, v0, #0 -; CHECK-SD-NEXT: ushll.2d v0, v0, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: zfunc1: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ushll.2d v2, v0, #0 -; CHECK-GI-NEXT: ushll2.2d v1, v0, #0 -; CHECK-GI-NEXT: mov.16b v0, v2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: zfunc1: +; CHECK: // %bb.0: +; CHECK-NEXT: ushll2.2d v1, v0, #0 +; CHECK-NEXT: ushll.2d v0, v0, #0 +; CHECK-NEXT: ret %r = zext <4 x i32> %v0 to <4 x i64> ret <4 x i64> %r } define <4 x i64> @zfunc2(<4 x i32> %v0) nounwind { -; CHECK-SD-LABEL: zfunc2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: sshll2.2d v1, v0, #0 -; CHECK-SD-NEXT: sshll.2d v0, v0, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: zfunc2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: sshll.2d v2, v0, #0 -; CHECK-GI-NEXT: sshll2.2d v1, v0, #0 -; CHECK-GI-NEXT: mov.16b v0, v2 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: zfunc2: +; CHECK: // %bb.0: +; CHECK-NEXT: sshll2.2d v1, v0, #0 +; CHECK-NEXT: sshll.2d v0, v0, #0 +; CHECK-NEXT: ret %r = sext <4 x i32> %v0 to <4 x i64> ret <4 x i64> %r } diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll index bf559da91901c..e13dde101271a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll +++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll @@ -130,8 +130,7 @@ define dso_local void @copy_notcxx14aggregate(ptr inreg noalias sret(%struct.Not define dso_local [2 x i64] @copy_notpod(ptr %x) { ; CHECK-LABEL: copy_notpod: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp x8, x1, [x0] -; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ldp x0, x1, [x0] ; CHECK-NEXT: ret %x2 = load [2 x i64], ptr %x ret [2 x i64] %x2 diff --git a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir index b940734c6988c..bc2d715995b90 100644 --- a/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir +++ b/llvm/test/CodeGen/AArch64/avoid-zero-copy.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 # Check that we can remove the redundant save of constant registers such as $wzr # RUN: llc -mtriple=aarch64-unknown-linux %s -verify-machineinstrs -start-before=machine-cp -o - | FileCheck %s --check-prefix ASM # RUN: llc -mtriple=aarch64-unknown-linux %s -verify-machineinstrs -run-pass=machine-cp -o - | FileCheck %s @@ -38,3 +39,5 @@ body: | $w0 = COPY killed renamable $w19 RET_ReallyLR implicit $w0 ... +## NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/cgp-usubo.ll b/llvm/test/CodeGen/AArch64/cgp-usubo.ll index d307107fc07ee..c8b7362508664 100644 --- a/llvm/test/CodeGen/AArch64/cgp-usubo.ll +++ b/llvm/test/CodeGen/AArch64/cgp-usubo.ll @@ -40,9 +40,8 @@ define i1 @usubo_ugt_constant_op0_i8(i8 %x, ptr %p) nounwind { ; CHECK-NEXT: mov w9, #42 // =0x2a ; CHECK-NEXT: cmp w8, #42 ; CHECK-NEXT: sub w9, w9, w0 -; CHECK-NEXT: cset w8, hi +; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: strb w9, [x1] -; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %s = sub i8 42, %x %ov = icmp ugt i8 %x, 42 @@ -59,9 +58,8 @@ define i1 @usubo_ult_constant_op0_i16(i16 %x, ptr %p) nounwind { ; CHECK-NEXT: mov w9, #43 // =0x2b ; CHECK-NEXT: cmp w8, #43 ; CHECK-NEXT: sub w9, w9, w0 -; CHECK-NEXT: cset w8, hi +; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: strh w9, [x1] -; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %s = sub i16 43, %x %ov = icmp ult i16 43, %x @@ -78,8 +76,7 @@ define i1 @usubo_ult_constant_op1_i16(i16 %x, ptr %p) nounwind { ; CHECK-NEXT: sub w9, w0, #44 ; CHECK-NEXT: cmp w8, #44 ; CHECK-NEXT: strh w9, [x1] -; CHECK-NEXT: cset w8, lo -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret %s = add i16 %x, -44 %ov = icmp ult i16 %x, 44 @@ -94,8 +91,7 @@ define i1 @usubo_ugt_constant_op1_i8(i8 %x, ptr %p) nounwind { ; CHECK-NEXT: sub w9, w0, #45 ; CHECK-NEXT: cmp w8, #45 ; CHECK-NEXT: strb w9, [x1] -; CHECK-NEXT: cset w8, lo -; CHECK-NEXT: mov w0, w8 +; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret %ov = icmp ugt i8 45, %x %s = add i8 %x, -45 @@ -110,9 +106,8 @@ define i1 @usubo_eq_constant1_op1_i32(i32 %x, ptr %p) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: cmp w0, #0 ; CHECK-NEXT: sub w9, w0, #1 -; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: str w9, [x1] -; CHECK-NEXT: mov w0, w8 ; CHECK-NEXT: ret %s = add i32 %x, -1 %ov = icmp eq i32 %x, 0 diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll index 186d191444feb..4558d7c464fe3 100644 --- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll +++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll @@ -31,10 +31,10 @@ define i32 @test_return(ptr %p, i32 %oldval, i32 %newval) { ; OUTLINE-ATOMICS-NEXT: .cfi_offset w29, -16 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w19, -24 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w20, -32 -; OUTLINE-ATOMICS-NEXT: mov x8, x0 ; OUTLINE-ATOMICS-NEXT: mov w19, w1 -; OUTLINE-ATOMICS-NEXT: mov w0, w1 +; OUTLINE-ATOMICS-NEXT: mov x8, x0 ; OUTLINE-ATOMICS-NEXT: mov w1, w2 +; OUTLINE-ATOMICS-NEXT: mov w0, w19 ; OUTLINE-ATOMICS-NEXT: mov x2, x8 ; OUTLINE-ATOMICS-NEXT: bl ___aarch64_cas4_acq_rel ; OUTLINE-ATOMICS-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload @@ -81,10 +81,10 @@ define i1 @test_return_bool(ptr %value, i8 %oldValue, i8 %newValue) { ; OUTLINE-ATOMICS-NEXT: .cfi_offset w29, -16 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w19, -24 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w20, -32 -; OUTLINE-ATOMICS-NEXT: mov x8, x0 ; OUTLINE-ATOMICS-NEXT: mov w19, w1 -; OUTLINE-ATOMICS-NEXT: mov w0, w1 +; OUTLINE-ATOMICS-NEXT: mov x8, x0 ; OUTLINE-ATOMICS-NEXT: mov w1, w2 +; OUTLINE-ATOMICS-NEXT: mov w0, w19 ; OUTLINE-ATOMICS-NEXT: mov x2, x8 ; OUTLINE-ATOMICS-NEXT: bl ___aarch64_cas1_acq_rel ; OUTLINE-ATOMICS-NEXT: cmp w0, w19, uxtb @@ -126,10 +126,10 @@ define void @test_conditional(ptr %p, i32 %oldval, i32 %newval) { ; OUTLINE-ATOMICS-NEXT: .cfi_offset w29, -16 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w19, -24 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w20, -32 -; 
OUTLINE-ATOMICS-NEXT: mov x8, x0 ; OUTLINE-ATOMICS-NEXT: mov w19, w1 -; OUTLINE-ATOMICS-NEXT: mov w0, w1 +; OUTLINE-ATOMICS-NEXT: mov x8, x0 ; OUTLINE-ATOMICS-NEXT: mov w1, w2 +; OUTLINE-ATOMICS-NEXT: mov w0, w19 ; OUTLINE-ATOMICS-NEXT: mov x2, x8 ; OUTLINE-ATOMICS-NEXT: bl ___aarch64_cas4_acq_rel ; OUTLINE-ATOMICS-NEXT: cmp w0, w19 diff --git a/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll b/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll index 420cf93a023d7..88dedb720f397 100644 --- a/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll +++ b/llvm/test/CodeGen/AArch64/duplane-index-patfrags.ll @@ -4,9 +4,8 @@ define <8 x half> @sel.v8f16.fmul(ptr %p, ptr %q, <8 x half> %a, <8 x half> %b, <4 x half> %c) { ; CHECK-LABEL: sel.v8f16.fmul: ; CHECK: // %bb.0: -; CHECK-NEXT: fmul v1.8h, v1.8h, v0.h[0] ; CHECK-NEXT: fmul v2.4h, v2.4h, v0.h[0] -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v0.8h, v1.8h, v0.h[0] ; CHECK-NEXT: str d2, [x0] ; CHECK-NEXT: ret %splat = shufflevector <8 x half> %a, <8 x half> poison, <8 x i32> zeroinitializer @@ -21,9 +20,8 @@ define <8 x half> @sel.v8f16.fmul(ptr %p, ptr %q, <8 x half> %a, <8 x half> %b, define <4 x float> @sel.v4f32.fmul(ptr %p, ptr %q, <4 x float> %a, <4 x float> %b, <2 x float> %c) { ; CHECK-LABEL: sel.v4f32.fmul: ; CHECK: // %bb.0: -; CHECK-NEXT: fmul v1.4s, v1.4s, v0.s[0] ; CHECK-NEXT: fmul v2.2s, v2.2s, v0.s[0] -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: fmul v0.4s, v1.4s, v0.s[0] ; CHECK-NEXT: str d2, [x0] ; CHECK-NEXT: ret %splat = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> zeroinitializer @@ -38,9 +36,8 @@ define <4 x float> @sel.v4f32.fmul(ptr %p, ptr %q, <4 x float> %a, <4 x float> % define <8 x i16> @sel.v8i16.mul(ptr %p, ptr %q, <8 x i16> %a, <8 x i16> %b, <4 x i16> %c) { ; CHECK-LABEL: sel.v8i16.mul: ; CHECK: // %bb.0: -; CHECK-NEXT: mul v1.8h, v1.8h, v0.h[0] ; CHECK-NEXT: mul v2.4h, v2.4h, v0.h[0] -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mul v0.8h, v1.8h, v0.h[0] ; CHECK-NEXT: str d2, [x0] ; CHECK-NEXT: ret %splat = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> zeroinitializer @@ -55,9 +52,8 @@ define <8 x i16> @sel.v8i16.mul(ptr %p, ptr %q, <8 x i16> %a, <8 x i16> %b, <4 x define <4 x i32> @sel.v4i32.mul(ptr %p, ptr %q, <4 x i32> %a, <4 x i32> %b, <2 x i32> %c) { ; CHECK-LABEL: sel.v4i32.mul: ; CHECK: // %bb.0: -; CHECK-NEXT: mul v1.4s, v1.4s, v0.s[0] ; CHECK-NEXT: mul v2.2s, v2.2s, v0.s[0] -; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: mul v0.4s, v1.4s, v0.s[0] ; CHECK-NEXT: str d2, [x0] ; CHECK-NEXT: ret %splat = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index a5d7ae147ffda..cd227a0235766 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -1618,23 +1618,23 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w1 ; CHECK-GI-NOFP16-NEXT: mov v18.s[1], w5 ; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w8 +; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NOFP16-NEXT: fmov w9, s5 -; CHECK-GI-NOFP16-NEXT: fmov s5, w7 ; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v6.h[0] ; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #8] -; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: fmov s5, w7 ; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v7.h[0] ; CHECK-GI-NOFP16-NEXT: ldr s7, [sp, #24] ; CHECK-GI-NOFP16-NEXT: mov v16.s[2], w2 +; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6 +; 
CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s +; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8 ; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w9 ; CHECK-GI-NOFP16-NEXT: fmov w9, s6 ; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #16] -; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8 -; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v17.s[0] ; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #40] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h -; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6 ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: mov v16.s[3], w3 ; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w9 @@ -1687,6 +1687,7 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40] ; CHECK-GI-FP16-NEXT: fmov s1, w8 ; CHECK-GI-FP16-NEXT: umov w8, v0.h[6] +; CHECK-GI-FP16-NEXT: fmov s5, w9 ; CHECK-GI-FP16-NEXT: mov v2.s[2], w10 ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: mov v6.s[2], w2 @@ -1694,20 +1695,19 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0] ; CHECK-GI-FP16-NEXT: mov v1.s[1], w9 ; CHECK-GI-FP16-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-GI-FP16-NEXT: fmov s5, w9 +; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 ; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-GI-FP16-NEXT: mov v6.s[3], w3 ; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s3 ; CHECK-GI-FP16-NEXT: fmov s3, w7 -; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 ; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31 ; CHECK-GI-FP16-NEXT: mov v3.s[1], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s4 ; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16] ; CHECK-GI-FP16-NEXT: ushl v1.4s, v1.4s, v2.4s ; CHECK-GI-FP16-NEXT: neg v2.4s, v2.4s -; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 ; CHECK-GI-FP16-NEXT: mov v3.s[2], w8 ; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v2.4s ; CHECK-GI-FP16-NEXT: fmov w8, s4 diff --git a/llvm/test/CodeGen/AArch64/fexplog.ll b/llvm/test/CodeGen/AArch64/fexplog.ll index 93d3d96d67b65..d4c4b461b82da 100644 --- a/llvm/test/CodeGen/AArch64/fexplog.ll +++ b/llvm/test/CodeGen/AArch64/fexplog.ll @@ -671,35 +671,29 @@ define <7 x half> @exp_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: mov h13, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt 
s0, h13 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -782,20 +776,17 @@ define <4 x half> @exp_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: mov h10, v0.h[3] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -912,40 +903,33 @@ define <8 x half> @exp_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: mov h14, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1148,40 +1132,33 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h1, v2.h[7] ; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h15 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, 
#208] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1189,46 +1166,39 @@ define <16 x half> @exp_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl expf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload @@ -1941,35 +1911,29 @@ define <7 x half> @exp2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: mov h13, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; 
CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -2052,20 +2016,17 @@ define <4 x half> @exp2_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: mov h10, v0.h[3] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -2182,40 +2143,33 @@ define <8 x half> @exp2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: mov h14, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte 
Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -2418,40 +2372,33 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h1, v2.h[7] ; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h15 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -2459,46 +2406,39 @@ define <16 x half> @exp2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; 
CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl exp2f ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload @@ -3211,35 +3151,29 @@ define <7 x half> @log_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: mov h13, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -3322,20 +3256,17 @@ define <4 x half> @log_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: mov h10, v0.h[3] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -3452,40 +3383,33 @@ define <8 x half> @log_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: mov h14, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h9 ; 
CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -3688,40 +3612,33 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h1, v2.h[7] ; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h15 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -3729,46 +3646,39 @@ define <16 x half> @log_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte 
Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl logf ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload @@ -4481,35 +4391,29 @@ define <7 x half> @log2_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: mov h13, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -4592,20 +4496,17 @@ define <4 x half> @log2_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: mov h10, v0.h[3] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl log2f -; 
CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -4722,40 +4623,33 @@ define <8 x half> @log2_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: mov h14, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -4958,40 +4852,33 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h1, v2.h[7] ; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h15 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; 
CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -4999,46 +4886,39 @@ define <16 x half> @log2_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log2f ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload @@ -5751,35 +5631,29 @@ define <7 x half> @log10_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: mov h13, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt 
s0, h9 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -5862,20 +5736,17 @@ define <4 x half> @log10_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: mov h10, v0.h[3] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -5992,40 +5863,33 @@ define <8 x half> @log10_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: mov h14, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -6228,40 +6092,33 @@ define <16 x half> 
@log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h1, v2.h[7] ; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h15 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -6269,46 +6126,39 @@ define <16 x half> @log10_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte 
Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl log10f ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/fpext.ll b/llvm/test/CodeGen/AArch64/fpext.ll index d942839c577d2..b9cba619bfacc 100644 --- a/llvm/test/CodeGen/AArch64/fpext.ll +++ b/llvm/test/CodeGen/AArch64/fpext.ll @@ -106,18 +106,15 @@ define <4 x fp128> @fpext_v4f16_v4f128(<4 x half> %a) { ; CHECK-SD-NEXT: bl __extendhftf2 ; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: mov h1, v1.h[1] -; CHECK-SD-NEXT: fmov s0, s1 +; CHECK-SD-NEXT: mov h0, v1.h[1] ; CHECK-SD-NEXT: bl __extendhftf2 ; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-SD-NEXT: mov h1, v1.h[2] -; CHECK-SD-NEXT: fmov s0, s1 +; CHECK-SD-NEXT: mov h0, v1.h[2] ; CHECK-SD-NEXT: bl __extendhftf2 ; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: mov h1, v1.h[3] -; CHECK-SD-NEXT: fmov s0, s1 +; CHECK-SD-NEXT: mov h0, v1.h[3] ; CHECK-SD-NEXT: bl __extendhftf2 ; CHECK-SD-NEXT: mov v3.16b, v0.16b ; CHECK-SD-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload @@ -179,8 +176,7 @@ define <4 x fp128> @fpext_v4f32_v4f128(<4 x float> %a) { ; CHECK-SD-NEXT: bl __extendsftf2 ; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-SD-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: mov s1, v1.s[1] -; CHECK-SD-NEXT: fmov s0, s1 +; CHECK-SD-NEXT: mov s0, v1.s[1] ; CHECK-SD-NEXT: bl __extendsftf2 ; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload @@ -302,18 +298,11 @@ entry: } define <4 x double> @fpext_v4f32_v4f64(<4 x float> %a) { -; CHECK-SD-LABEL: fpext_v4f32_v4f64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fcvtl2 v1.2d, v0.4s -; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fpext_v4f32_v4f64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fcvtl v2.2d, v0.2s -; CHECK-GI-NEXT: fcvtl2 v1.2d, v0.4s -; CHECK-GI-NEXT: mov v0.16b, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fpext_v4f32_v4f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl2 v1.2d, v0.4s +; CHECK-NEXT: fcvtl v0.2d, v0.2s +; CHECK-NEXT: ret entry: %c = fpext <4 x float> %a to <4 x double> ret <4 x double> %c @@ -432,18 +421,11 @@ entry: } define <8 x float> @fpext_v8f16_v8f32(<8 x half> %a) { -; CHECK-SD-LABEL: fpext_v8f16_v8f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fcvtl2 v1.4s, v0.8h -; CHECK-SD-NEXT: fcvtl v0.4s, v0.4h -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fpext_v8f16_v8f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fcvtl v2.4s, v0.4h -; CHECK-GI-NEXT: fcvtl2 v1.4s, v0.8h -; CHECK-GI-NEXT: mov v0.16b, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fpext_v8f16_v8f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl2 v1.4s, v0.8h +; CHECK-NEXT: fcvtl v0.4s, v0.4h +; CHECK-NEXT: 
ret entry: %c = fpext <8 x half> %a to <8 x float> ret <8 x float> %c diff --git a/llvm/test/CodeGen/AArch64/fpow.ll b/llvm/test/CodeGen/AArch64/fpow.ll index 8d40121ad4543..f2f8ec56e32fc 100644 --- a/llvm/test/CodeGen/AArch64/fpow.ll +++ b/llvm/test/CodeGen/AArch64/fpow.ll @@ -813,29 +813,25 @@ define <7 x half> @pow_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -956,23 +952,20 @@ define <4 x half> @pow_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1120,29 +1113,25 @@ define <8 x half> @pow_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #190] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h14 ; 
CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1437,37 +1426,32 @@ define <16 x half> @pow_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: str h1, [sp, #302] // 2-byte Folded Spill ; CHECK-GI-NEXT: fcvt s1, h2 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h10 +; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h1 +; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h11 +; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #240] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h1 +; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: bl powf ; CHECK-GI-NEXT: ldr h1, [sp, #176] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 diff --git a/llvm/test/CodeGen/AArch64/fpowi.ll b/llvm/test/CodeGen/AArch64/fpowi.ll index 5dbcaa4a5fda1..dd59620eb020c 100644 --- a/llvm/test/CodeGen/AArch64/fpowi.ll +++ b/llvm/test/CodeGen/AArch64/fpowi.ll @@ -736,41 +736,35 @@ define <7 x half> @powi_v7f16(<7 x half> %a, i32 %b) { ; CHECK-GI-NEXT: mov h13, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; 
CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl __powisf2 ; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -860,23 +854,20 @@ define <4 x half> @powi_v4f16(<4 x half> %a, i32 %b) { ; CHECK-GI-NEXT: mov h10, v0.h[3] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl __powisf2 ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1004,47 +995,40 @@ define <8 x half> @powi_v8f16(<8 x half> %a, i32 %b) { ; CHECK-GI-NEXT: mov h14, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl __powisf2 ; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1267,47 +1251,40 @@ define <16 x half> @powi_v16f16(<16 x half> %a, i32 %b) { ; CHECK-GI-NEXT: mov h1, v2.h[7] ; CHECK-GI-NEXT: str h1, [sp, #176] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; 
CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h15 ; CHECK-GI-NEXT: bl __powisf2 ; CHECK-GI-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1316,53 +1293,46 @@ define <16 x half> @powi_v16f16(<16 x half> %a, i32 %b) { ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: mov w0, w19 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr h1, [sp, #80] // 2-byte Folded Reload ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: ldr h1, [sp, #112] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr h1, [sp, #112] // 2-byte 
Folded Reload ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: ldr h1, [sp, #176] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 +; CHECK-GI-NEXT: ldr h1, [sp, #176] // 2-byte Folded Reload ; CHECK-GI-NEXT: mov w0, w19 -; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl __powisf2 ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload ; CHECK-GI-NEXT: ldr q2, [sp, #144] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/frem.ll b/llvm/test/CodeGen/AArch64/frem.ll index 1a10fd2f1cdc3..3d87a9193f21c 100644 --- a/llvm/test/CodeGen/AArch64/frem.ll +++ b/llvm/test/CodeGen/AArch64/frem.ll @@ -800,29 +800,25 @@ define <7 x half> @frem_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #174] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -943,23 +939,20 @@ define <4 x half> @frem_v4f16(<4 x half> %a, <4 x half> %b) { ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1107,29 +1100,25 @@ define <8 x half> @frem_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NEXT: fcvt s1, h1 ; CHECK-GI-NEXT: str h2, [sp, #190] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, 
h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1424,37 +1413,32 @@ define <16 x half> @frem_v16f16(<16 x half> %a, <16 x half> %b) { ; CHECK-GI-NEXT: str h1, [sp, #302] // 2-byte Folded Spill ; CHECK-GI-NEXT: fcvt s1, h2 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: str q0, [sp, #304] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: fcvt s2, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: str q0, [sp, #320] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h10 +; CHECK-GI-NEXT: ldr h1, [sp, #272] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #272] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h1 +; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s2, h1 -; CHECK-GI-NEXT: fcvt s1, h11 +; CHECK-GI-NEXT: ldr h1, [sp, #240] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #240] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s2 +; CHECK-GI-NEXT: fcvt s0, h1 +; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: bl fmodf ; CHECK-GI-NEXT: ldr h1, [sp, #176] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 diff --git a/llvm/test/CodeGen/AArch64/fsincos.ll b/llvm/test/CodeGen/AArch64/fsincos.ll index 0b34f9570fa77..0ed2d5398d726 100644 --- a/llvm/test/CodeGen/AArch64/fsincos.ll +++ b/llvm/test/CodeGen/AArch64/fsincos.ll @@ -671,35 +671,29 @@ define <7 x half> @sin_v7f16(<7 x half> %a) { ; CHECK-GI-NEXT: mov h13, v0.h[6] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, 
h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -782,20 +776,17 @@ define <4 x half> @sin_v4f16(<4 x half> %a) { ; CHECK-GI-NEXT: mov h10, v0.h[3] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -912,40 +903,33 @@ define <8 x half> @sin_v8f16(<8 x half> %a) { ; CHECK-GI-NEXT: mov h14, v0.h[7] ; CHECK-GI-NEXT: fcvt s0, h0 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1148,40 +1132,33 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: mov h1, v2.h[7] ; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h15 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; 
CHECK-GI-NEXT: fcvt s0, h15 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h8 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h8 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h9 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h9 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h10 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h10 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h11 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h11 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h12 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h12 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h13 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h13 ; CHECK-GI-NEXT: bl sinf ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 @@ -1189,46 +1166,39 @@ define <16 x half> @sin_v16f16(<16 x half> %a) { ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill ; CHECK-GI-NEXT: fmov s0, s1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: fcvt s1, h14 ; CHECK-GI-NEXT: fcvt h0, s0 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h14 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill -; CHECK-GI-NEXT: fmov s0, s1 +; CHECK-GI-NEXT: fcvt s0, h1 ; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload ; CHECK-GI-NEXT: fcvt h0, s0 -; CHECK-GI-NEXT: fcvt s1, h1 +; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded 
Reload
 ; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h1
 ; CHECK-GI-NEXT: bl sinf
 ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload
@@ -1941,35 +1911,29 @@ define <7 x half> @cos_v7f16(<7 x half> %a) {
 ; CHECK-GI-NEXT: mov h13, v0.h[6]
 ; CHECK-GI-NEXT: fcvt s0, h0
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h8
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h8
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h9
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h9
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h10
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h10
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h11
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h11
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h12
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h12
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h13
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h13
 ; CHECK-GI-NEXT: bl cosf
 ; CHECK-GI-NEXT: ldp q2, q1, [sp, #64] // 32-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
@@ -2052,20 +2016,17 @@ define <4 x half> @cos_v4f16(<4 x half> %a) {
 ; CHECK-GI-NEXT: mov h10, v0.h[3]
 ; CHECK-GI-NEXT: fcvt s0, h0
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h8
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h8
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h9
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h9
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h10
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h10
 ; CHECK-GI-NEXT: bl cosf
 ; CHECK-GI-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
@@ -2182,40 +2143,33 @@ define <8 x half> @cos_v8f16(<8 x half> %a) {
 ; CHECK-GI-NEXT: mov h14, v0.h[7]
 ; CHECK-GI-NEXT: fcvt s0, h0
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h8
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h8
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h9
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h9
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h10
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h10
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h11
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h11
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h12
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h12
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h13
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h13
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h14
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h14
 ; CHECK-GI-NEXT: bl cosf
 ; CHECK-GI-NEXT: ldp q2, q1, [sp, #80] // 32-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
@@ -2418,40 +2372,33 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT: mov h1, v2.h[7]
 ; CHECK-GI-NEXT: str h1, [sp, #160] // 2-byte Folded Spill
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h15
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #192] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h15
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h8
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #128] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h8
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h9
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #224] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h9
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h10
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #208] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h10
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h11
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #176] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h11
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h12
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #144] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h12
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h13
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp, #112] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h13
 ; CHECK-GI-NEXT: bl cosf
 ; CHECK-GI-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
@@ -2459,46 +2406,39 @@ define <16 x half> @cos_v16f16(<16 x half> %a) {
 ; CHECK-GI-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
 ; CHECK-GI-NEXT: fmov s0, s1
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: fcvt s1, h14
 ; CHECK-GI-NEXT: fcvt h0, s0
 ; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h14
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: ldr h1, [sp, #16] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h1
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: ldr h1, [sp, #32] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h1
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: ldr h1, [sp, #48] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: str q0, [sp, #48] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h1
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: ldr h1, [sp, #64] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h1
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: ldr h1, [sp, #96] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: str q0, [sp, #96] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h1
 ; CHECK-GI-NEXT: bl cosf
-; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: fcvt h0, s0
-; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: ldr h1, [sp, #160] // 2-byte Folded Reload
 ; CHECK-GI-NEXT: str q0, [sp, #160] // 16-byte Folded Spill
-; CHECK-GI-NEXT: fmov s0, s1
+; CHECK-GI-NEXT: fcvt s0, h1
 ; CHECK-GI-NEXT: bl cosf
 ; CHECK-GI-NEXT: ldr q3, [sp, #192] // 16-byte Folded Reload
 ; CHECK-GI-NEXT: ldr q2, [sp, #128] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir b/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir
index 8e29255189bf5..5465b2ef214e4 100644
--- a/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir
+++ b/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir
@@ -16,8 +16,8 @@ body: |
     ; CHECK-LABEL: name: 1-ldrwpre-ldrwui-merge
     ; CHECK: liveins: $w0, $w1, $x1
     ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $w0, renamable $w2 = LDPWpre renamable $x1, 5 :: (load (s32))
-    ; CHECK-NEXT: STPWi renamable $w0, killed renamable $w2, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: early-clobber $x1, renamable $w0, renamable $w2 = LDPWpre killed renamable $x1, 5 :: (load (s32))
+    ; CHECK-NEXT: STPWi killed renamable $w0, killed renamable $w2, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $w0 = LDRWpre killed renamable $x1, 20 :: (load (s32))
    renamable $w2 = LDRWui renamable $x1, 1 :: (load (s32))
@@ -42,8 +42,8 @@ body: |
    ; CHECK-LABEL: name: 2-ldrxpre-ldrxui-merge
    ; CHECK: liveins: $x1, $x2, $x3
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $x2, renamable $x3 = LDPXpre renamable $x1, 3 :: (load (s64))
-    ; CHECK-NEXT: STPXi renamable $x2, renamable $x3, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber $x1, renamable $x2, renamable $x3 = LDPXpre killed renamable $x1, 3 :: (load (s64))
+    ; CHECK-NEXT: STPXi killed renamable $x2, killed renamable $x3, renamable $x1, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $x2 = LDRXpre killed renamable $x1, 24 :: (load (s64))
    renamable $x3 = LDRXui renamable $x1, 1 :: (load (s64))
@@ -68,8 +68,8 @@ body: |
    ; CHECK-LABEL: name: 3-ldrspre-ldrsui-merge
    ; CHECK: liveins: $s0, $s1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre renamable $x1, 3 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre killed renamable $x1, 3 :: (load (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
    renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
@@ -94,8 +94,8 @@ body: |
    ; CHECK-LABEL: name: 4-ldrqdre-ldrdui-merge
    ; CHECK: liveins: $d0, $d1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $d0, renamable $d1 = LDPDpre renamable $x1, 16 :: (load (s64))
-    ; CHECK-NEXT: STPDi renamable $d0, renamable $d1, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber $x1, renamable $d0, renamable $d1 = LDPDpre killed renamable $x1, 16 :: (load (s64))
+    ; CHECK-NEXT: STPDi killed renamable $d0, killed renamable $d1, renamable $x1, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $d0 = LDRDpre killed renamable $x1, 128 :: (load (s64))
    renamable $d1 = LDRDui renamable $x1, 1 :: (load (s64))
@@ -124,8 +124,8 @@ body: |
    ; CHECK-LABEL: name: 5-ldrqpre-ldrqui-merge
    ; CHECK: liveins: $q0, $q1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre renamable $x1, 3 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre killed renamable $x1, 3 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 48 :: (load (s128))
    renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
@@ -155,8 +155,8 @@ body: |
    ; CHECK: liveins: $q0, $q1, $x1
    ; CHECK-NEXT: {{ $}}
    ; CHECK-NEXT: renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 48, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 48, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 48 :: (load (s128))
@@ -185,8 +185,8 @@ body: |
    ; CHECK-LABEL: name: 7-ldrqpre-ldrqui-max-offset-merge
    ; CHECK: liveins: $q0, $q1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre renamable $x1, 15 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre killed renamable $x1, 15 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 240 :: (load (s128))
    renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
@@ -215,8 +215,8 @@ body: |
    ; CHECK-LABEL: name: 8-ldrqpre-ldrqui-min-offset-merge
    ; CHECK: liveins: $q0, $q1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre renamable $x1, -16 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre killed renamable $x1, -16 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, -256 :: (load (s128))
    renamable $q1 = LDRQui renamable $x1, 1 :: (load (s128))
@@ -246,10 +246,10 @@ body: |
    ; CHECK-LABEL: name: 9-ldrspre-ldrsui-mod-base-reg-no-merge
    ; CHECK: liveins: $s0, $s1, $x0, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: dead early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 12, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: renamable $x1 = LDRXui renamable $x0, 1 :: (load (s64))
+    ; CHECK-NEXT: dead early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: renamable $x1 = LDRXui killed renamable $x0, 1 :: (load (s64))
    ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
    renamable $x1 = LDRXui renamable $x0, 1 :: (load (s64))
@@ -280,11 +280,11 @@ body: |
    ; CHECK-LABEL: name: 10-ldrspre-ldrsui-used-base-reg-no-merge
    ; CHECK: liveins: $s0, $s1, $x0, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 12, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12, implicit $w1 :: (load (s32))
    ; CHECK-NEXT: renamable $x0 = LDRXui renamable $x1, 1 :: (load (s64))
-    ; CHECK-NEXT: STRXui renamable $x0, renamable $x0, 1 :: (store (s64))
+    ; CHECK-NEXT: STRXui killed renamable $x0, renamable $x0, 1 :: (store (s64))
    ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
    renamable $x0 = LDRXui renamable $x1, 1 :: (load (s64))
@@ -315,13 +315,13 @@ body: |
    ; CHECK-LABEL: name: 11-ldrqpre-ldrqpre-no-merge
    ; CHECK: liveins: $q0, $q1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q0 = LDRQpre renamable $x1, 48, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q1 = LDRQpre renamable $x1, 1, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q0 = LDRQpre renamable $x1, 16, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q1 = LDRQpre renamable $x1, 12, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 16, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q1 = LDRQpre renamable $x1, 16, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q0 = LDRQpre killed renamable $x1, 48, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q1 = LDRQpre killed renamable $x1, 1, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q0 = LDRQpre killed renamable $x1, 16, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $q1 = LDRQpre killed renamable $x1, 12, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 16, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q1 = LDRQpre killed renamable $x1, 16, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 48 :: (load (s128))
    early-clobber renamable $x1, renamable $q1 = LDRQpre killed renamable $x1, 1 :: (load (s128))
@@ -352,9 +352,9 @@ body: |
    ; CHECK-LABEL: name: 12-ldrspre-ldrsui-no-merge
    ; CHECK: liveins: $s0, $s1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 12, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12, implicit $w1 :: (load (s32))
    ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 2 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
    renamable $s1 = LDRSui renamable $x1, 2 :: (load (s32))
@@ -383,10 +383,10 @@ body: |
    ; CHECK-LABEL: name: 13-ldrqpre-ldrdui-no-merge
    ; CHECK: liveins: $d1, $q0, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 32, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32, implicit $w1 :: (load (s128))
    ; CHECK-NEXT: renamable $d1 = LDRDui renamable $x1, 1 :: (load (s64))
-    ; CHECK-NEXT: STRQui renamable $q0, renamable $x1, 0 :: (store (s128))
-    ; CHECK-NEXT: STRDui renamable $d1, renamable $x1, 1 :: (store (s64))
+    ; CHECK-NEXT: STRQui killed renamable $q0, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: STRDui killed renamable $d1, killed renamable $x1, 1 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
    renamable $d1 = LDRDui renamable $x1, 1 :: (load (s64))
@@ -415,8 +415,8 @@ body: |
    ; CHECK-LABEL: name: 14-ldrqpre-strqui-no-merge
    ; CHECK: liveins: $q0, $q1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 32, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: STRQui renamable $q0, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: STRQui killed renamable $q0, killed renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
    STRQui killed renamable $q0, renamable $x1, 0 :: (store (s128))
@@ -443,8 +443,8 @@ body: |
    ; CHECK-LABEL: name: 15-ldrqpre-ldrqui-same-dst-reg-no-merge
    ; CHECK: liveins: $q0, $q1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, dead $q2, renamable $q0 = LDPQpre renamable $x1, 2 :: (load (s128))
-    ; CHECK-NEXT: STRQui renamable $q0, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, dead $q2, renamable $q0 = LDPQpre killed renamable $x1, 2 :: (load (s128))
+    ; CHECK-NEXT: STRQui killed renamable $q0, killed renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
    renamable $q0 = LDRQui renamable $x1, 1 :: (load (s128))
@@ -473,9 +473,9 @@ body: |
    ; CHECK-LABEL: name: 16-ldrqpre-ldrqui-diff-base-reg-no-merge
    ; CHECK: liveins: $q0, $q1, $x1, $x2
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 32, implicit $w1 :: (load (s128))
-    ; CHECK-NEXT: renamable $q1 = LDRQui renamable $x2, 1 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: renamable $q1 = LDRQui killed renamable $x2, 1 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
    renamable $q1 = LDRQui renamable $x2, 1 :: (load (s128))
@@ -504,8 +504,8 @@ body: |
    ; CHECK-LABEL: name: 17-ldrqpre-ldurqi-merge
    ; CHECK: liveins: $q0, $q1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre renamable $x1, 2 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: early-clobber $x1, renamable $q0, renamable $q1 = LDPQpre killed renamable $x1, 2 :: (load (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
    renamable $q1 = LDURQi renamable $x1, 16 :: (load (s128))
@@ -534,9 +534,9 @@ body: |
    ; CHECK-LABEL: name: 18-ldrqpre-ldurqi-no-merge
    ; CHECK: liveins: $q0, $q1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre renamable $x1, 32, implicit $w1 :: (load (s128))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32, implicit $w1 :: (load (s128))
    ; CHECK-NEXT: renamable $q1 = LDURQi renamable $x1, 1 :: (load (s128))
-    ; CHECK-NEXT: STPQi renamable $q0, renamable $q1, renamable $x1, 0 :: (store (s128))
+    ; CHECK-NEXT: STPQi killed renamable $q0, killed renamable $q1, renamable $x1, 0 :: (store (s128))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $q0 = LDRQpre killed renamable $x1, 32 :: (load (s128))
    renamable $q1 = LDURQi renamable $x1, 1 :: (load (s128))
@@ -561,8 +561,8 @@ body: |
    ; CHECK-LABEL: name: 19-ldrspre-ldrsui-max-merge
    ; CHECK: liveins: $s0, $s1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre renamable $x1, 63 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre killed renamable $x1, 63 :: (load (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 252 :: (load (s32))
    renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
@@ -587,9 +587,9 @@ body: |
    ; CHECK-LABEL: name: 20-ldrspre-ldrsui-unaligned-no-merge
    ; CHECK: liveins: $s0, $s1, $x1
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 251, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 251, implicit $w1 :: (load (s32))
    ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
+    ; CHECK-NEXT: STPSi killed renamable $s0, killed renamable $s1, renamable $x1, 0 :: (store (s32))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 251 :: (load (s32))
    renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
@@ -614,8 +614,8 @@ body: |
    ; CHECK-LABEL: name: 21-ldrswpre-ldrswui-merge
    ; CHECK: liveins: $x0, $x1, $x2
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $x0, renamable $x2 = LDPSWpre renamable $x1, 10 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber $x1, renamable $x0, renamable $x2 = LDPSWpre killed renamable $x1, 10 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
    renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
@@ -640,8 +640,8 @@ body: |
    ; CHECK-LABEL: name: 22-ldrswpre-ldurswi-merge
    ; CHECK: liveins: $x0, $x1, $x2
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber $x1, renamable $x0, renamable $x2 = LDPSWpre renamable $x1, 10 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber $x1, renamable $x0, renamable $x2 = LDPSWpre killed renamable $x1, 10 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
    renamable $x2 = LDURSWi renamable $x1, 4 :: (load (s32))
@@ -667,8 +667,8 @@ body: |
    ; CHECK: liveins: $x0, $x1, $x2
    ; CHECK-NEXT: {{ $}}
    ; CHECK-NEXT: renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre renamable $x1, 40, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
    early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
@@ -694,8 +694,8 @@ body: |
    ; CHECK: liveins: $x0, $x1, $x2
    ; CHECK-NEXT: {{ $}}
    ; CHECK-NEXT: renamable $x2 = LDURSWi renamable $x1, 4 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre renamable $x1, 40, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    renamable $x2 = LDURSWi renamable $x1, 4 :: (load (s32))
    early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
@@ -720,13 +720,13 @@ body: |
    ; CHECK-LABEL: name: 25-ldrswpre-ldrswpre-no-merge
    ; CHECK: liveins: $x0, $x1, $x2
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x0 = LDRSWpre renamable $x1, 48, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x2 = LDRSWpre renamable $x1, 1, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x0 = LDRSWpre renamable $x1, 16, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x2 = LDRSWpre renamable $x1, 12, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre renamable $x1, 16, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x2 = LDRSWpre renamable $x1, 16, implicit $w1 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x0 = LDRSWpre killed renamable $x1, 48, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x2 = LDRSWpre killed renamable $x1, 1, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x0 = LDRSWpre killed renamable $x1, 16, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, dead renamable $x2 = LDRSWpre killed renamable $x1, 12, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 16, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x2 = LDRSWpre killed renamable $x1, 16, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 48 :: (load (s32))
    early-clobber renamable $x1, renamable $x2 = LDRSWpre killed renamable $x1, 1 :: (load (s32))
@@ -755,9 +755,9 @@ body: |
    ; CHECK-LABEL: name: 26-ldrswpre-ldrwui-no-merge
    ; CHECK: liveins: $x0, $x1, $x2
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre renamable $x1, 40, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40, implicit $w1 :: (load (s32))
    ; CHECK-NEXT: renamable $w2 = LDRWui renamable $x1, 1, implicit-def $x2 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32))
    renamable $w2 = LDRWui renamable $x1, 1 :: (load (s32))
@@ -782,9 +782,9 @@ body: |
    ; CHECK-LABEL: name: 27-ldrwpre-ldrswui-no-merge
    ; CHECK: liveins: $x0, $x1, $x2
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x1, renamable $w0 = LDRWpre renamable $x1, 40, implicit $w1 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x1, renamable $w0 = LDRWpre killed renamable $x1, 40, implicit $w1 :: (load (s32))
    ; CHECK-NEXT: renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64))
+    ; CHECK-NEXT: STPXi killed renamable $x0, killed renamable $x2, renamable $x1, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x1, renamable $w0 = LDRWpre killed renamable $x1, 40 :: (load (s32))
    renamable $x2 = LDRSWui renamable $x1, 1 :: (load (s32))
@@ -808,11 +808,11 @@ body: |
    ; CHECK-LABEL: name: 28-ldrswpre-ldrwpre-no-merge
    ; CHECK: liveins: $x11, $x13
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $x10 = LDRSWpre renamable $x11, 8, implicit $w11 :: (load (s32), align 8)
+    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $x10 = LDRSWpre killed renamable $x11, 8, implicit $w11 :: (load (s32), align 8)
    ; CHECK-NEXT: $x14 = EORXrs renamable $x11, renamable $x13, 0
-    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $w12 = LDRWpre renamable $x11, 4, implicit $w11 :: (load (s32))
-    ; CHECK-NEXT: $x13 = EORXrs renamable $x11, renamable $x13, 0
-    ; CHECK-NEXT: STPXi renamable $x13, renamable $x14, renamable $x11, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $w12 = LDRWpre killed renamable $x11, 4, implicit $w11 :: (load (s32))
+    ; CHECK-NEXT: $x13 = EORXrs renamable $x11, killed renamable $x13, 0
+    ; CHECK-NEXT: STPXi killed renamable $x13, killed renamable $x14, renamable $x11, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x11, renamable $x10 = LDRSWpre killed renamable $x11, 8 :: (load (s32), align 8)
    $x14 = EORXrs renamable $x11, renamable $x13, 0
@@ -838,11 +838,11 @@ body: |
    ; CHECK-LABEL: name: 29-ldrwpre-ldrswpre-no-merge
    ; CHECK: liveins: $x11, $x13
    ; CHECK-NEXT: {{ $}}
-    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $w12 = LDRWpre renamable $x11, 8, implicit $w11 :: (load (s32))
+    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $w12 = LDRWpre killed renamable $x11, 8, implicit $w11 :: (load (s32))
    ; CHECK-NEXT: $x14 = EORXrs renamable $x11, renamable $x13, 0
-    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $x10 = LDRSWpre renamable $x11, 4, implicit $w11 :: (load (s32), align 8)
-    ; CHECK-NEXT: $x13 = EORXrs renamable $x11, renamable $x13, 0
-    ; CHECK-NEXT: STPXi renamable $x13, renamable $x14, renamable $x11, 0 :: (store (s64))
+    ; CHECK-NEXT: early-clobber renamable $x11, dead renamable $x10 = LDRSWpre killed renamable $x11, 4, implicit $w11 :: (load (s32), align 8)
+    ; CHECK-NEXT: $x13 = EORXrs renamable $x11, killed renamable $x13, 0
+    ; CHECK-NEXT: STPXi killed renamable $x13, killed renamable $x14, renamable $x11, 0 :: (store (s64))
    ; CHECK-NEXT: RET undef $lr
    early-clobber renamable $x11, renamable $w12 = LDRWpre killed renamable $x11, 8 :: (load (s32))
    $x14 = EORXrs renamable $x11, renamable $x13, 0
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 51d17ad0644f1..a09fb07d00d97 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -104,10 +104,9 @@ define <2 x half> @exp10_v2f16(<2 x half> %x) {
 ; GISEL-NEXT: mov h8, v0.h[1]
 ; GISEL-NEXT: fcvt s0, h0
 ; GISEL-NEXT: bl exp10f
-; GISEL-NEXT: fcvt s1, h8
 ; GISEL-NEXT: fcvt h0, s0
 ; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill
-; GISEL-NEXT: fmov s0, s1
+; GISEL-NEXT: fcvt s0, h8
 ; GISEL-NEXT: bl exp10f
 ; GISEL-NEXT: fcvt h1, s0
 ; GISEL-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -177,15 +176,13 @@ define <3 x half> @exp10_v3f16(<3 x half> %x) {
 ; GISEL-NEXT: mov h9, v0.h[2]
 ; GISEL-NEXT: fcvt s0, h0
 ;
GISEL-NEXT: bl exp10f -; GISEL-NEXT: fcvt s1, h8 ; GISEL-NEXT: fcvt h0, s0 ; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; GISEL-NEXT: fmov s0, s1 +; GISEL-NEXT: fcvt s0, h8 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: fcvt s1, h9 ; GISEL-NEXT: fcvt h0, s0 ; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill -; GISEL-NEXT: fmov s0, s1 +; GISEL-NEXT: fcvt s0, h9 ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload ; GISEL-NEXT: fcvt h0, s0 @@ -260,20 +257,17 @@ define <4 x half> @exp10_v4f16(<4 x half> %x) { ; GISEL-NEXT: mov h10, v0.h[3] ; GISEL-NEXT: fcvt s0, h0 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: fcvt s1, h8 ; GISEL-NEXT: fcvt h0, s0 ; GISEL-NEXT: str q0, [sp, #32] // 16-byte Folded Spill -; GISEL-NEXT: fmov s0, s1 +; GISEL-NEXT: fcvt s0, h8 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: fcvt s1, h9 ; GISEL-NEXT: fcvt h0, s0 ; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; GISEL-NEXT: fmov s0, s1 +; GISEL-NEXT: fcvt s0, h9 ; GISEL-NEXT: bl exp10f -; GISEL-NEXT: fcvt s1, h10 ; GISEL-NEXT: fcvt h0, s0 ; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill -; GISEL-NEXT: fmov s0, s1 +; GISEL-NEXT: fcvt s0, h10 ; GISEL-NEXT: bl exp10f ; GISEL-NEXT: ldp q2, q1, [sp, #16] // 32-byte Folded Reload ; GISEL-NEXT: fcvt h0, s0 diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index c3c0ec5e3d9d8..fad28d325b7d1 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -216,10 +216,9 @@ define <3 x i8> @load_v3i8(ptr %ptr){ ; ; CHECK-GI-LABEL: load_v3i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: ldrb w8, [x0] ; CHECK-GI-NEXT: ldrb w1, [x0, #1] ; CHECK-GI-NEXT: ldrb w2, [x0, #2] -; CHECK-GI-NEXT: mov w0, w8 +; CHECK-GI-NEXT: ldrb w0, [x0] ; CHECK-GI-NEXT: ret %a = load <3 x i8>, ptr %ptr ret <3 x i8> %a diff --git a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll index f394bc467af69..a2b6f45a8fa41 100644 --- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll +++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll @@ -11,7 +11,7 @@ define i32 @check_lr_liveness(ptr %arg) #1 { ; CHECK-NEXT: successors: %bb.4(0x20000000), %bb.1(0x60000000) ; CHECK-NEXT: liveins: $x0, $lr ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $x8 = COPY $x0 + ; CHECK-NEXT: renamable $x8 = COPY killed $x0 ; CHECK-NEXT: renamable $w0 = MOVi32imm -536870206 ; CHECK-NEXT: CBNZX killed renamable $x8, %bb.1 ; CHECK-NEXT: {{ $}} @@ -38,7 +38,7 @@ define i32 @check_lr_liveness(ptr %arg) #1 { ; CHECK-NEXT: bb.3.bb2: ; CHECK-NEXT: liveins: $w0, $lr ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: RET_ReallyLR implicit killed $w0 bb: %icmp = icmp eq ptr %arg, null %or = or i1 %icmp, false diff --git a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir index 5b379c2bd5629..e087f0eca7585 100644 --- a/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir +++ b/llvm/test/CodeGen/AArch64/machine-cp-sub-reg.mir @@ -10,13 +10,13 @@ body: | ; CHECK-NEXT: successors: %bb.1(0x80000000) ; CHECK-NEXT: liveins: $w0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $w8 = ORRWrs $wzr, $w0, 0, implicit-def $x8 + ; CHECK-NEXT: $w8 = ORRWrs $wzr, killed $w0, 0, implicit-def $x8 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: liveins: $x8 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x0 = ADDXri $x8, 1, 0 - ; CHECK-NEXT: RET undef $lr, implicit $x0 + ; CHECK-NEXT: $x0 = ADDXri killed $x8, 1, 0 + ; CHECK-NEXT: RET undef $lr, 
implicit killed $x0 bb.0: successors: %bb.1(0x80000000) liveins: $w0 diff --git a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll index e7e109170d6a1..338084295fc7f 100644 --- a/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll +++ b/llvm/test/CodeGen/AArch64/machine-sink-kill-flags.ll @@ -16,13 +16,12 @@ define i32 @test(ptr %ptr) { ; CHECK-NEXT: mov w9, wzr ; CHECK-NEXT: LBB0_1: ; %.thread ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lsr w11, w9, #1 ; CHECK-NEXT: sub w10, w9, #1 -; CHECK-NEXT: mov w9, w11 +; CHECK-NEXT: lsr w9, w9, #1 ; CHECK-NEXT: tbnz w10, #0, LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %bb343 ; CHECK-NEXT: and w9, w10, #0x1 -; CHECK-NEXT: mov w0, #-1 +; CHECK-NEXT: mov w0, #-1 ; =0xffffffff ; CHECK-NEXT: str w9, [x8] ; CHECK-NEXT: ret bb: diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll index f2e62bc4f3c8c..dda694dd2b317 100644 --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-neon.ll @@ -50,11 +50,10 @@ define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 { define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 { ; CHECK-LABEL: splice_v16f32_idx: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v6.16b, v3.16b, v4.16b, #12 ; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #12 ; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: ext v2.16b, v3.16b, v4.16b, #12 ; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12 -; CHECK-NEXT: mov v2.16b, v6.16b ; CHECK-NEXT: ret %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7) ret <16 x float> %res @@ -107,11 +106,10 @@ define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 { define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 { ; CHECK-LABEL: splice_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v6.16b, v3.16b, v4.16b, #12 ; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #12 ; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: ext v2.16b, v3.16b, v4.16b, #12 ; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12 -; CHECK-NEXT: mov v2.16b, v6.16b ; CHECK-NEXT: ret %res = call <16 x float> @llvm.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9) ret <16 x float> %res diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 6f4b090fb22bd..9550c394c167d 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -71,22 +71,19 @@ entry: define <32 x i16> @extadds_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) { ; CHECK-SD-LABEL: extadds_v32i8_i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: saddl2 v4.8h, v1.16b, v3.16b -; CHECK-SD-NEXT: saddl v5.8h, v0.8b, v2.8b ; CHECK-SD-NEXT: saddl2 v6.8h, v0.16b, v2.16b +; CHECK-SD-NEXT: saddl v0.8h, v0.8b, v2.8b ; CHECK-SD-NEXT: saddl v2.8h, v1.8b, v3.8b -; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: saddl2 v3.8h, v1.16b, v3.16b ; CHECK-SD-NEXT: mov v1.16b, v6.16b -; CHECK-SD-NEXT: mov v3.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: extadds_v32i8_i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: saddl v4.8h, v0.8b, v2.8b ; CHECK-GI-NEXT: saddl2 v5.8h, v0.16b, v2.16b +; CHECK-GI-NEXT: saddl v0.8h, v0.8b, v2.8b ; CHECK-GI-NEXT: saddl v2.8h, v1.8b, v3.8b ; CHECK-GI-NEXT: saddl2 v3.8h, v1.16b, v3.16b -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret 
entry: @@ -99,22 +96,19 @@ entry: define <32 x i16> @extaddu_v32i8_i16(<32 x i8> %s0, <32 x i8> %s1) { ; CHECK-SD-LABEL: extaddu_v32i8_i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: uaddl2 v4.8h, v1.16b, v3.16b -; CHECK-SD-NEXT: uaddl v5.8h, v0.8b, v2.8b ; CHECK-SD-NEXT: uaddl2 v6.8h, v0.16b, v2.16b +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v2.8b ; CHECK-SD-NEXT: uaddl v2.8h, v1.8b, v3.8b -; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: uaddl2 v3.8h, v1.16b, v3.16b ; CHECK-SD-NEXT: mov v1.16b, v6.16b -; CHECK-SD-NEXT: mov v3.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: extaddu_v32i8_i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: uaddl v4.8h, v0.8b, v2.8b ; CHECK-GI-NEXT: uaddl2 v5.8h, v0.16b, v2.16b +; CHECK-GI-NEXT: uaddl v0.8h, v0.8b, v2.8b ; CHECK-GI-NEXT: uaddl v2.8h, v1.8b, v3.8b ; CHECK-GI-NEXT: uaddl2 v3.8h, v1.16b, v3.16b -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret entry: @@ -821,22 +815,19 @@ entry: define <16 x i32> @extadds_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) { ; CHECK-SD-LABEL: extadds_v16i16_i32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: saddl2 v4.4s, v1.8h, v3.8h -; CHECK-SD-NEXT: saddl v5.4s, v0.4h, v2.4h ; CHECK-SD-NEXT: saddl2 v6.4s, v0.8h, v2.8h +; CHECK-SD-NEXT: saddl v0.4s, v0.4h, v2.4h ; CHECK-SD-NEXT: saddl v2.4s, v1.4h, v3.4h -; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: saddl2 v3.4s, v1.8h, v3.8h ; CHECK-SD-NEXT: mov v1.16b, v6.16b -; CHECK-SD-NEXT: mov v3.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: extadds_v16i16_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: saddl v4.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: saddl2 v5.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: saddl v0.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: saddl v2.4s, v1.4h, v3.4h ; CHECK-GI-NEXT: saddl2 v3.4s, v1.8h, v3.8h -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret entry: @@ -849,22 +840,19 @@ entry: define <16 x i32> @extaddu_v16i16_i32(<16 x i16> %s0, <16 x i16> %s1) { ; CHECK-SD-LABEL: extaddu_v16i16_i32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: uaddl2 v4.4s, v1.8h, v3.8h -; CHECK-SD-NEXT: uaddl v5.4s, v0.4h, v2.4h ; CHECK-SD-NEXT: uaddl2 v6.4s, v0.8h, v2.8h +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v2.4h ; CHECK-SD-NEXT: uaddl v2.4s, v1.4h, v3.4h -; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: uaddl2 v3.4s, v1.8h, v3.8h ; CHECK-SD-NEXT: mov v1.16b, v6.16b -; CHECK-SD-NEXT: mov v3.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: extaddu_v16i16_i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: uaddl v4.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: uaddl2 v5.4s, v0.8h, v2.8h +; CHECK-GI-NEXT: uaddl v0.4s, v0.4h, v2.4h ; CHECK-GI-NEXT: uaddl v2.4s, v1.4h, v3.4h ; CHECK-GI-NEXT: uaddl2 v3.4s, v1.8h, v3.8h -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret entry: @@ -1093,22 +1081,19 @@ entry: define <8 x i64> @extadds_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) { ; CHECK-SD-LABEL: extadds_v8i32_i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: saddl2 v4.2d, v1.4s, v3.4s -; CHECK-SD-NEXT: saddl v5.2d, v0.2s, v2.2s ; CHECK-SD-NEXT: saddl2 v6.2d, v0.4s, v2.4s +; CHECK-SD-NEXT: saddl v0.2d, v0.2s, v2.2s ; CHECK-SD-NEXT: saddl v2.2d, v1.2s, v3.2s -; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: saddl2 v3.2d, v1.4s, v3.4s ; CHECK-SD-NEXT: mov v1.16b, v6.16b -; CHECK-SD-NEXT: mov v3.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: extadds_v8i32_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: saddl v4.2d, 
v0.2s, v2.2s ; CHECK-GI-NEXT: saddl2 v5.2d, v0.4s, v2.4s +; CHECK-GI-NEXT: saddl v0.2d, v0.2s, v2.2s ; CHECK-GI-NEXT: saddl v2.2d, v1.2s, v3.2s ; CHECK-GI-NEXT: saddl2 v3.2d, v1.4s, v3.4s -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret entry: @@ -1121,22 +1106,19 @@ entry: define <8 x i64> @extaddu_v8i32_i64(<8 x i32> %s0, <8 x i32> %s1) { ; CHECK-SD-LABEL: extaddu_v8i32_i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: uaddl2 v4.2d, v1.4s, v3.4s -; CHECK-SD-NEXT: uaddl v5.2d, v0.2s, v2.2s ; CHECK-SD-NEXT: uaddl2 v6.2d, v0.4s, v2.4s +; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v2.2s ; CHECK-SD-NEXT: uaddl v2.2d, v1.2s, v3.2s -; CHECK-SD-NEXT: mov v0.16b, v5.16b +; CHECK-SD-NEXT: uaddl2 v3.2d, v1.4s, v3.4s ; CHECK-SD-NEXT: mov v1.16b, v6.16b -; CHECK-SD-NEXT: mov v3.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: extaddu_v8i32_i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: uaddl v4.2d, v0.2s, v2.2s ; CHECK-GI-NEXT: uaddl2 v5.2d, v0.4s, v2.4s +; CHECK-GI-NEXT: uaddl v0.2d, v0.2s, v2.2s ; CHECK-GI-NEXT: uaddl v2.2d, v1.2s, v3.2s ; CHECK-GI-NEXT: uaddl2 v3.2d, v1.4s, v3.4s -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-extmul.ll b/llvm/test/CodeGen/AArch64/neon-extmul.ll index 3dbc033dfab96..cdd04eb3fa9c3 100644 --- a/llvm/test/CodeGen/AArch64/neon-extmul.ll +++ b/llvm/test/CodeGen/AArch64/neon-extmul.ll @@ -296,13 +296,12 @@ define <8 x i64> @extmuladds_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b) ; CHECK-SD-LABEL: extmuladds_v8i8_i64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: smull v0.8h, v0.8b, v1.8b -; CHECK-SD-NEXT: sshll2 v6.4s, v0.8h, #0 ; CHECK-SD-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-SD-NEXT: saddw2 v5.2d, v5.2d, v6.4s +; CHECK-SD-NEXT: sshll2 v6.4s, v0.8h, #0 ; CHECK-SD-NEXT: saddw v0.2d, v2.2d, v1.2s ; CHECK-SD-NEXT: saddw2 v1.2d, v3.2d, v1.4s +; CHECK-SD-NEXT: saddw2 v3.2d, v5.2d, v6.4s ; CHECK-SD-NEXT: saddw v2.2d, v4.2d, v6.2s -; CHECK-SD-NEXT: mov v3.16b, v5.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: extmuladds_v8i8_i64: @@ -334,13 +333,12 @@ define <8 x i64> @extmuladdu_v8i8_i64(<8 x i8> %s0, <8 x i8> %s1, <8 x i64> %b) ; CHECK-SD-LABEL: extmuladdu_v8i8_i64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: umull v0.8h, v0.8b, v1.8b -; CHECK-SD-NEXT: ushll2 v6.4s, v0.8h, #0 ; CHECK-SD-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-SD-NEXT: uaddw2 v5.2d, v5.2d, v6.4s +; CHECK-SD-NEXT: ushll2 v6.4s, v0.8h, #0 ; CHECK-SD-NEXT: uaddw v0.2d, v2.2d, v1.2s ; CHECK-SD-NEXT: uaddw2 v1.2d, v3.2d, v1.4s +; CHECK-SD-NEXT: uaddw2 v3.2d, v5.2d, v6.4s ; CHECK-SD-NEXT: uaddw v2.2d, v4.2d, v6.2s -; CHECK-SD-NEXT: mov v3.16b, v5.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: extmuladdu_v8i8_i64: diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll index 15763543113eb..c93a8c14cf8f2 100644 --- a/llvm/test/CodeGen/AArch64/neon-perm.ll +++ b/llvm/test/CodeGen/AArch64/neon-perm.ll @@ -4092,10 +4092,9 @@ entry: define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) { ; CHECK-SD-LABEL: test_uzp: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: xtn v2.8b, v0.8h ; CHECK-SD-NEXT: uzp2 v1.16b, v0.16b, v0.16b +; CHECK-SD-NEXT: xtn v0.8b, v0.8h ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 -; CHECK-SD-NEXT: fmov d0, d2 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: test_uzp: diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 529a3b72e0971..7d7b862098879 100644 --- 
a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -337,9 +337,9 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-GI-LABEL: sext_v3i16_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: smov x8, v0.h[0] ; CHECK-GI-NEXT: smov x9, v0.h[1] ; CHECK-GI-NEXT: smov x10, v0.h[2] +; CHECK-GI-NEXT: smov x8, v0.h[0] ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 @@ -362,9 +362,9 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) { ; ; CHECK-GI-LABEL: sext_v3i32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: smov x8, v0.s[0] ; CHECK-GI-NEXT: smov x9, v0.s[1] ; CHECK-GI-NEXT: smov x10, v0.s[2] +; CHECK-GI-NEXT: smov x8, v0.s[0] ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 @@ -547,18 +547,11 @@ entry: } define <4 x i64> @sext_v4i32_v4i64(<4 x i32> %a) { -; CHECK-SD-LABEL: sext_v4i32_v4i64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll2 v1.2d, v0.4s, #0 -; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sext_v4i32_v4i64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.2d, v0.2s, #0 -; CHECK-GI-NEXT: sshll2 v1.2d, v0.4s, #0 -; CHECK-GI-NEXT: mov v0.16b, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sext_v4i32_v4i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret entry: %c = sext <4 x i32> %a to <4 x i64> ret <4 x i64> %c @@ -671,18 +664,11 @@ entry: } define <8 x i32> @sext_v8i16_v8i32(<8 x i16> %a) { -; CHECK-SD-LABEL: sext_v8i16_v8i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0 -; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sext_v8i16_v8i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v1.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov v0.16b, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sext_v8i16_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret entry: %c = sext <8 x i16> %a to <8 x i32> ret <8 x i32> %c @@ -716,21 +702,19 @@ entry: define <8 x i64> @sext_v8i32_v8i64(<8 x i32> %a) { ; CHECK-SD-LABEL: sext_v8i32_v8i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll v5.2d, v0.2s, #0 ; CHECK-SD-NEXT: sshll2 v4.2d, v0.4s, #0 ; CHECK-SD-NEXT: sshll2 v3.2d, v1.4s, #0 +; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: sshll v2.2d, v1.2s, #0 -; CHECK-SD-NEXT: mov v0.16b, v5.16b ; CHECK-SD-NEXT: mov v1.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v8i32_v8i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v4.2d, v0.2s, #0 ; CHECK-GI-NEXT: sshll2 v5.2d, v0.4s, #0 ; CHECK-GI-NEXT: sshll v2.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: sshll2 v3.2d, v1.4s, #0 -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret entry: @@ -816,18 +800,11 @@ entry: } define <16 x i16> @sext_v16i8_v16i16(<16 x i8> %a) { -; CHECK-SD-LABEL: sext_v16i8_v16i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll2 v1.8h, v0.16b, #0 -; CHECK-SD-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sext_v16i8_v16i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll2 v1.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov v0.16b, v2.16b -; CHECK-GI-NEXT: ret +; 
CHECK-LABEL: sext_v16i8_v16i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sshll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret entry: %c = sext <16 x i8> %a to <16 x i16> ret <16 x i16> %c @@ -902,21 +879,19 @@ entry: define <16 x i32> @sext_v16i16_v16i32(<16 x i16> %a) { ; CHECK-SD-LABEL: sext_v16i16_v16i32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll v5.4s, v0.4h, #0 ; CHECK-SD-NEXT: sshll2 v4.4s, v0.8h, #0 ; CHECK-SD-NEXT: sshll2 v3.4s, v1.8h, #0 +; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-SD-NEXT: mov v0.16b, v5.16b ; CHECK-SD-NEXT: mov v1.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v16i16_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v4.4s, v0.4h, #0 ; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0 ; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret entry: @@ -964,30 +939,28 @@ entry: define <16 x i64> @sext_v16i32_v16i64(<16 x i32> %a) { ; CHECK-SD-LABEL: sext_v16i32_v16i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll2 v17.2d, v0.4s, #0 -; CHECK-SD-NEXT: sshll2 v16.2d, v1.4s, #0 ; CHECK-SD-NEXT: sshll v18.2d, v1.2s, #0 +; CHECK-SD-NEXT: sshll2 v16.2d, v1.4s, #0 ; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-SD-NEXT: sshll2 v7.2d, v3.4s, #0 ; CHECK-SD-NEXT: sshll v4.2d, v2.2s, #0 ; CHECK-SD-NEXT: sshll2 v5.2d, v2.4s, #0 -; CHECK-SD-NEXT: sshll2 v7.2d, v3.4s, #0 ; CHECK-SD-NEXT: sshll v6.2d, v3.2s, #0 -; CHECK-SD-NEXT: mov v1.16b, v17.16b +; CHECK-SD-NEXT: sshll2 v1.2d, v0.4s, #0 ; CHECK-SD-NEXT: mov v2.16b, v18.16b ; CHECK-SD-NEXT: mov v3.16b, v16.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v16i32_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v16.2d, v0.2s, #0 +; CHECK-GI-NEXT: sshll2 v19.2d, v1.4s, #0 ; CHECK-GI-NEXT: sshll2 v17.2d, v0.4s, #0 ; CHECK-GI-NEXT: sshll v18.2d, v1.2s, #0 -; CHECK-GI-NEXT: sshll2 v19.2d, v1.4s, #0 +; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: sshll v4.2d, v2.2s, #0 -; CHECK-GI-NEXT: sshll2 v5.2d, v2.4s, #0 ; CHECK-GI-NEXT: sshll v6.2d, v3.2s, #0 ; CHECK-GI-NEXT: sshll2 v7.2d, v3.4s, #0 -; CHECK-GI-NEXT: mov v0.16b, v16.16b +; CHECK-GI-NEXT: sshll2 v5.2d, v2.4s, #0 ; CHECK-GI-NEXT: mov v1.16b, v17.16b ; CHECK-GI-NEXT: mov v2.16b, v18.16b ; CHECK-GI-NEXT: mov v3.16b, v19.16b diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll index b1131f287fe9a..56ff1c88e34f6 100644 --- a/llvm/test/CodeGen/AArch64/shufflevector.ll +++ b/llvm/test/CodeGen/AArch64/shufflevector.ll @@ -355,18 +355,11 @@ define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) { } define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-SD-LABEL: shufflevector_v4i64: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: zip2 v2.2d, v2.2d, v3.2d -; CHECK-SD-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-SD-NEXT: mov v1.16b, v2.16b -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: shufflevector_v4i64: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: zip2 v0.2d, v0.2d, v1.2d -; CHECK-GI-NEXT: zip2 v1.2d, v2.2d, v3.2d -; CHECK-GI-NEXT: ret +; CHECK-LABEL: shufflevector_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d +; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d +; CHECK-NEXT: ret %c = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %c } diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir 
b/llvm/test/CodeGen/AArch64/spillfill-sve.mir index 11cf388e38531..8df159da3c3bf 100644 --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy %s -o - | FileCheck %s # RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND --- | @@ -34,14 +35,7 @@ body: | bb.0.entry: liveins: $p0 - ; CHECK-LABEL: name: spills_fills_stack_id_ppr - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' - ; EXPAND-LABEL: name: spills_fills_stack_id_ppr - ; EXPAND: STR_PXI $p0, $sp, 7 - ; EXPAND: $p0 = LDR_PXI $sp, 7 %0:ppr = COPY $p0 @@ -77,16 +71,7 @@ body: | bb.0.entry: liveins: $p0_p1 - ; CHECK-LABEL: name: spills_fills_stack_id_ppr2 - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' - ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2 - ; EXPAND: STR_PXI $p0, $sp, 6 - ; EXPAND: STR_PXI $p1, $sp, 7 - ; EXPAND: $p0 = LDR_PXI $sp, 6 - ; EXPAND: $p1 = LDR_PXI $sp, 7 %0:ppr2 = COPY $p0_p1 @@ -122,16 +107,7 @@ body: | bb.0.entry: liveins: $p0_p1 - ; CHECK-LABEL: name: spills_fills_stack_id_ppr2 - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' - ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2mul2 - ; EXPAND: STR_PXI $p0, $sp, 6 - ; EXPAND: STR_PXI $p1, $sp, 7 - ; EXPAND: $p0 = LDR_PXI $sp, 6 - ; EXPAND: $p1 = LDR_PXI $sp, 7 %0:ppr2mul2 = COPY $p0_p1 @@ -167,14 +143,7 @@ body: | bb.0.entry: liveins: $pn0 - ; CHECK-LABEL: name: spills_fills_stack_id_pnr - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' - ; EXPAND-LABEL: name: spills_fills_stack_id_pnr - ; EXPAND: STR_PXI $pn0, $sp, 7 - ; EXPAND: $pn0 = LDR_PXI $sp, 7, implicit-def $pn0 %0:pnr = COPY $pn0 @@ -206,17 +175,7 @@ registers: stack: body: | bb.0.entry: - ; CHECK-LABEL: name: spills_fills_stack_id_virtreg_pnr - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' - ; EXPAND-LABEL: name: spills_fills_stack_id_virtreg_pnr - ; EXPAND: renamable $pn8 = WHILEGE_CXX_B - ; EXPAND: STR_PXI killed renamable $pn8, $sp, 7 - ; - ; EXPAND: renamable $pn8 = LDR_PXI $sp, 7 - ; EXPAND: $p0 = PEXT_PCI_B killed renamable $pn8, 0 %0:pnr_p8to15 = WHILEGE_CXX_B undef $x0, undef $x0, 0, implicit-def dead $nzcv @@ -253,14 +212,7 @@ body: | bb.0.entry: liveins: $z0 - ; CHECK-LABEL: name: spills_fills_stack_id_zpr - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 16, alignment: 16 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' - ; EXPAND-LABEL: name: spills_fills_stack_id_zpr - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 %0:zpr = COPY $z0 @@ -288,16 +240,7 @@ body: | bb.0.entry: liveins: $z0_z1 - ; CHECK-LABEL: name: spills_fills_stack_id_zpr2 - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, 
offset: 0, size: 32, alignment: 16 - ; CHECK-NEXT: stack-id: scalable-vector - ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2 - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: STR_ZXI $z1, $sp, 1 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 - ; EXPAND: $z1 = LDR_ZXI $sp, 1 %0:zpr2 = COPY $z0_z1 @@ -333,16 +276,7 @@ body: | bb.1: liveins: $z0_z8 - ; CHECK-LABEL: name: spills_fills_stack_id_zpr2strided - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 16 - ; CHECK-NEXT: stack-id: scalable-vector - ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2strided - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: STR_ZXI $z8, $sp, 1 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 - ; EXPAND: $z8 = LDR_ZXI $sp, 1 %0:zpr2strided = COPY $z0_z8 @@ -370,18 +304,7 @@ body: | bb.0.entry: liveins: $z0_z1_z2 - ; CHECK-LABEL: name: spills_fills_stack_id_zpr3 - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 48, alignment: 16 - ; CHECK-NEXT: stack-id: scalable-vector - ; EXPAND-LABEL: name: spills_fills_stack_id_zpr3 - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: STR_ZXI $z1, $sp, 1 - ; EXPAND: STR_ZXI $z2, $sp, 2 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 - ; EXPAND: $z1 = LDR_ZXI $sp, 1 - ; EXPAND: $z2 = LDR_ZXI $sp, 2 %0:zpr3 = COPY $z0_z1_z2 @@ -409,20 +332,7 @@ body: | bb.0.entry: liveins: $z0_z1_z2_z3 - ; CHECK-LABEL: name: spills_fills_stack_id_zpr4 - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 64, alignment: 16 - ; CHECK-NEXT: stack-id: scalable-vector - - ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4 - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: STR_ZXI $z1, $sp, 1 - ; EXPAND: STR_ZXI $z2, $sp, 2 - ; EXPAND: STR_ZXI $z3, $sp, 3 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 - ; EXPAND: $z1 = LDR_ZXI $sp, 1 - ; EXPAND: $z2 = LDR_ZXI $sp, 2 - ; EXPAND: $z3 = LDR_ZXI $sp, 3 + %0:zpr4 = COPY $z0_z1_z2_z3 @@ -457,20 +367,7 @@ body: | bb.1: liveins: $z0_z4_z8_z12 - ; CHECK-LABEL: name: spills_fills_stack_id_zpr4strided - ; CHECK: stack: - ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 64, alignment: 16 - ; CHECK-NEXT: stack-id: scalable-vector - - ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4strided - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: STR_ZXI $z4, $sp, 1 - ; EXPAND: STR_ZXI $z8, $sp, 2 - ; EXPAND: STR_ZXI $z12, $sp, 3 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 - ; EXPAND: $z4 = LDR_ZXI $sp, 1 - ; EXPAND: $z8 = LDR_ZXI $sp, 2 - ; EXPAND: $z12 = LDR_ZXI $sp, 3 + %0:zpr4strided = COPY $z0_z4_z8_z12 @@ -486,3 +383,6 @@ body: | $z0_z4_z8_z12 = COPY %0 RET_ReallyLR ... +## NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +# CHECK: {{.*}} +# EXPAND: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll index 20faeb23eed59..0f8e8a8f06ce2 100644 --- a/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll +++ b/llvm/test/CodeGen/AArch64/streaming-compatible-memory-ops.ll @@ -180,16 +180,15 @@ define void @sc_memcpy(i64 noundef %n) "aarch64_pstate_sm_compatible" nounwind { ; CHECK-NO-SME-ROUTINES-NEXT: stp x30, x9, [sp, #64] // 16-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: str x19, [sp, #80] // 8-byte Folded Spill ; CHECK-NO-SME-ROUTINES-NEXT: bl __arm_sme_state -; CHECK-NO-SME-ROUTINES-NEXT: adrp x8, :got:dst ; CHECK-NO-SME-ROUTINES-NEXT: and x19, x0, #0x1 +; CHECK-NO-SME-ROUTINES-NEXT: adrp x0, :got:dst ; CHECK-NO-SME-ROUTINES-NEXT: adrp x1, :got:src -; CHECK-NO-SME-ROUTINES-NEXT: ldr x8, [x8, :got_lo12:dst] +; CHECK-NO-SME-ROUTINES-NEXT: ldr x0, [x0, :got_lo12:dst] ; CHECK-NO-SME-ROUTINES-NEXT: ldr x1, [x1, :got_lo12:src] ; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_2 ; CHECK-NO-SME-ROUTINES-NEXT: // %bb.1: // %entry ; CHECK-NO-SME-ROUTINES-NEXT: smstop sm ; CHECK-NO-SME-ROUTINES-NEXT: .LBB3_2: // %entry -; CHECK-NO-SME-ROUTINES-NEXT: mov x0, x8 ; CHECK-NO-SME-ROUTINES-NEXT: bl memcpy ; CHECK-NO-SME-ROUTINES-NEXT: tbz w19, #0, .LBB3_4 ; CHECK-NO-SME-ROUTINES-NEXT: // %bb.3: // %entry diff --git a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll index 88e13ea1e0fa4..142fa413c7e55 100644 --- a/llvm/test/CodeGen/AArch64/sve-sext-zext.ll +++ b/llvm/test/CodeGen/AArch64/sve-sext-zext.ll @@ -192,9 +192,8 @@ define @zext_i32_i64( %a) { define @sext_b_to_h( %a) { ; CHECK-LABEL: sext_b_to_h: ; CHECK: // %bb.0: -; CHECK-NEXT: sunpklo z2.h, z0.b ; CHECK-NEXT: sunpkhi z1.h, z0.b -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: ret %ext = sext %a to ret %ext @@ -203,9 +202,8 @@ define @sext_b_to_h( %a) { define @sext_h_to_s( %a) { ; CHECK-LABEL: sext_h_to_s: ; CHECK: // %bb.0: -; CHECK-NEXT: sunpklo z2.s, z0.h ; CHECK-NEXT: sunpkhi z1.s, z0.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: ret %ext = sext %a to ret %ext @@ -214,9 +212,8 @@ define @sext_h_to_s( %a) { define @sext_s_to_d( %a) { ; CHECK-LABEL: sext_s_to_d: ; CHECK: // %bb.0: -; CHECK-NEXT: sunpklo z2.d, z0.s ; CHECK-NEXT: sunpkhi z1.d, z0.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: ret %ext = sext %a to ret %ext @@ -261,9 +258,8 @@ define @sext_b_to_d( %a) { define @zext_b_to_h( %a) { ; CHECK-LABEL: zext_b_to_h: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z2.h, z0.b ; CHECK-NEXT: uunpkhi z1.h, z0.b -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ret %ext = zext %a to ret %ext @@ -272,9 +268,8 @@ define @zext_b_to_h( %a) { define @zext_h_to_s( %a) { ; CHECK-LABEL: zext_h_to_s: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z1.s, z0.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: ret %ext = zext %a to ret %ext @@ -283,9 +278,8 @@ define @zext_h_to_s( %a) { define @zext_s_to_d( %a) { ; CHECK-LABEL: zext_s_to_d: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z1.d, z0.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ret %ext = zext %a to ret %ext @@ -333,9 +327,8 @@ define @zext_4i8_4i64( %aval) { ; CHECK-LABEL: zext_4i8_4i64: ; CHECK: // %bb.0: ; 
CHECK-NEXT: and z0.s, z0.s, #0xff -; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z1.d, z0.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ret %aext = zext <vscale x 4 x i8> %aval to <vscale x 4 x i64> ret <vscale x 4 x i64> %aext @@ -345,9 +338,8 @@ define <vscale x 4 x i64> @zext_4i16_4i64(<vscale x 4 x i16> %aval) { ; CHECK-LABEL: zext_4i16_4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: and z0.s, z0.s, #0xffff -; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z1.d, z0.s -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ret %aext = zext <vscale x 4 x i16> %aval to <vscale x 4 x i64> ret <vscale x 4 x i64> %aext @@ -357,9 +349,8 @@ define <vscale x 8 x i32> @zext_8i8_8i32(<vscale x 8 x i8> %aval) { ; CHECK-LABEL: zext_8i8_8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: and z0.h, z0.h, #0xff -; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z1.s, z0.h -; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: ret %aext = zext <vscale x 8 x i8> %aval to <vscale x 8 x i32> ret <vscale x 8 x i32> %aext diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll index 77aaeeadcfc2f..da293ab51e73a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -3132,8 +3132,8 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind { ; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! ; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] ; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] -; NONEON-NOSVE-NEXT: ldp x8, x10, [sp] -; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll index fd1365d56fee4..eafb71a0b23a3 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll @@ -171,10 +171,9 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv ; CHECK-LABEL: vector_deinterleave_nxv4i64_nxv8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d -; CHECK-NEXT: uzp1 z5.d, z0.d, z1.d ; CHECK-NEXT: uzp2 z6.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d ; CHECK-NEXT: uzp2 z3.d, z2.d, z3.d -; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: mov z1.d, z4.d ; CHECK-NEXT: mov z2.d, z6.d ; CHECK-NEXT: ret @@ -186,20 +185,16 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nx ; CHECK-LABEL: vector_deinterleave_nxv8i64_nxv16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: uzp1 z24.d, z2.d, z3.d -; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z26.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z27.d, z6.d, z7.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z2.d, z4.d, z5.d ; CHECK-NEXT: uzp2 z28.d, z0.d, z1.d ; CHECK-NEXT: uzp2 z29.d, z2.d, z3.d -; CHECK-NEXT: uzp2 z30.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z3.d, z6.d, z7.d ; CHECK-NEXT: uzp2 z7.d, z6.d, z7.d -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: uzp2 z6.d, z4.d, z5.d ; CHECK-NEXT: mov z1.d, z24.d -; CHECK-NEXT: mov z2.d, z26.d -; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: mov z4.d, z28.d ; CHECK-NEXT: mov z5.d, z29.d -; CHECK-NEXT: mov z6.d, z30.d ; CHECK-NEXT: ret %retval = call {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64> %vec) ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll index e2c3b0abe21aa..fe089fa4a6417 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll @@ 
-166,10 +166,9 @@ define @interleave2_nxv16i32( %vec0, @llvm.vector.interleave2.nxv16i32( %vec0, %vec1) @@ -181,10 +180,9 @@ define @interleave2_nxv8i64( %vec0, @llvm.vector.interleave2.nxv8i64( %vec0, %vec1) diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll index 3a481efd9785a..9ef6047b0eb8c 100644 --- a/llvm/test/CodeGen/AArch64/vec_umulo.ll +++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll @@ -60,8 +60,7 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; CHECK-NEXT: uzp2 v2.4s, v3.4s, v2.4s ; CHECK-NEXT: st1 { v1.s }[2], [x8] ; CHECK-NEXT: str d1, [x0] -; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: cmtst v0.4s, v2.4s, v2.4s ; CHECK-NEXT: ret %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 @@ -79,8 +78,7 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { ; CHECK-NEXT: mul v1.4s, v0.4s, v1.4s ; CHECK-NEXT: uzp2 v2.4s, v3.4s, v2.4s ; CHECK-NEXT: str q1, [x0] -; CHECK-NEXT: cmtst v2.4s, v2.4s, v2.4s -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: cmtst v0.4s, v2.4s, v2.4s ; CHECK-NEXT: ret %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 @@ -143,15 +141,13 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; CHECK-NEXT: umull v5.2d, v0.2s, v2.2s ; CHECK-NEXT: umull2 v6.2d, v1.4s, v3.4s ; CHECK-NEXT: umull v7.2d, v1.2s, v3.2s -; CHECK-NEXT: mul v1.4s, v1.4s, v3.4s ; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s +; CHECK-NEXT: mul v1.4s, v1.4s, v3.4s ; CHECK-NEXT: uzp2 v4.4s, v5.4s, v4.4s ; CHECK-NEXT: uzp2 v5.4s, v7.4s, v6.4s ; CHECK-NEXT: stp q2, q1, [x0] -; CHECK-NEXT: cmtst v4.4s, v4.4s, v4.4s -; CHECK-NEXT: cmtst v5.4s, v5.4s, v5.4s -; CHECK-NEXT: mov v0.16b, v4.16b -; CHECK-NEXT: mov v1.16b, v5.16b +; CHECK-NEXT: cmtst v0.4s, v4.4s, v4.4s +; CHECK-NEXT: cmtst v1.4s, v5.4s, v5.4s ; CHECK-NEXT: ret %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 54ada05c90448..150a67ab7974d 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -4819,45 +4819,47 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b ; CHECK-GI-NEXT: ldr d3, [x10] ; CHECK-GI-NEXT: ldr d4, [x11] -; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 +; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0 +; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: ldr d2, [x10] +; CHECK-GI-NEXT: ldr d6, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 -; CHECK-GI-NEXT: ldr d6, [x11] -; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: usubl v3.8h, v3.8b, v4.8b -; CHECK-GI-NEXT: abs v5.4s, v5.4s -; CHECK-GI-NEXT: abs v0.4s, v0.4s ; CHECK-GI-NEXT: ldr d4, [x10] ; CHECK-GI-NEXT: ldr d16, [x11] +; CHECK-GI-NEXT: abs v5.4s, v5.4s +; CHECK-GI-NEXT: abs v0.4s, v0.4s ; CHECK-GI-NEXT: abs v7.4s, v7.4s -; CHECK-GI-NEXT: abs v1.4s, v1.4s ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, 
x11, x8 +; CHECK-GI-NEXT: abs v1.4s, v1.4s +; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v16.8b ; CHECK-GI-NEXT: usubl v2.8h, v2.8b, v6.8b ; CHECK-GI-NEXT: ldr d6, [x10] ; CHECK-GI-NEXT: ldr d17, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v16.8b -; CHECK-GI-NEXT: sshll v16.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s ; CHECK-GI-NEXT: ldr d5, [x10] +; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s ; CHECK-GI-NEXT: ldr d7, [x11] +; CHECK-GI-NEXT: sshll v19.4s, v4.4h, #0 +; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: sshll v16.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: sshll v18.4s, v2.4h, #0 ; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 ; CHECK-GI-NEXT: usubl v6.8h, v6.8b, v17.8b -; CHECK-GI-NEXT: ldr d17, [x11, x8] -; CHECK-GI-NEXT: sshll v19.4s, v4.4h, #0 ; CHECK-GI-NEXT: usubl v5.8h, v5.8b, v7.8b ; CHECK-GI-NEXT: ldr d7, [x10, x9] -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 +; CHECK-GI-NEXT: ldr d17, [x11, x8] +; CHECK-GI-NEXT: abs v19.4s, v19.4s +; CHECK-GI-NEXT: abs v4.4s, v4.4s ; CHECK-GI-NEXT: abs v16.4s, v16.4s ; CHECK-GI-NEXT: abs v3.4s, v3.4s ; CHECK-GI-NEXT: abs v18.4s, v18.4s @@ -4865,36 +4867,33 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI-NEXT: usubl v7.8h, v7.8b, v17.8b ; CHECK-GI-NEXT: sshll v17.4s, v6.4h, #0 ; CHECK-GI-NEXT: sshll2 v6.4s, v6.8h, #0 -; CHECK-GI-NEXT: abs v19.4s, v19.4s -; CHECK-GI-NEXT: abs v4.4s, v4.4s +; CHECK-GI-NEXT: add v4.4s, v19.4s, v4.4s +; CHECK-GI-NEXT: addv s1, v1.4s +; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: add v3.4s, v16.4s, v3.4s ; CHECK-GI-NEXT: sshll v16.4s, v5.4h, #0 ; CHECK-GI-NEXT: sshll2 v5.4s, v5.8h, #0 ; CHECK-GI-NEXT: add v2.4s, v18.4s, v2.4s ; CHECK-GI-NEXT: abs v17.4s, v17.4s -; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: abs v6.4s, v6.4s -; CHECK-GI-NEXT: addv s0, v0.4s -; CHECK-GI-NEXT: add v4.4s, v19.4s, v4.4s -; CHECK-GI-NEXT: addv s3, v3.4s +; CHECK-GI-NEXT: addv s4, v4.4s ; CHECK-GI-NEXT: sshll v18.4s, v7.4h, #0 ; CHECK-GI-NEXT: sshll2 v7.4s, v7.8h, #0 ; CHECK-GI-NEXT: abs v16.4s, v16.4s ; CHECK-GI-NEXT: abs v5.4s, v5.4s ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: add v6.4s, v17.4s, v6.4s -; CHECK-GI-NEXT: addv s2, v2.4s ; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: addv s4, v4.4s -; CHECK-GI-NEXT: fmov w10, s3 +; CHECK-GI-NEXT: addv s2, v2.4s +; CHECK-GI-NEXT: addv s3, v3.4s +; CHECK-GI-NEXT: fmov w10, s4 ; CHECK-GI-NEXT: abs v18.4s, v18.4s ; CHECK-GI-NEXT: abs v7.4s, v7.4s ; CHECK-GI-NEXT: add v1.4s, v16.4s, v5.4s ; CHECK-GI-NEXT: add w8, w8, w9 -; CHECK-GI-NEXT: addv s3, v6.4s ; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: addv s3, v6.4s ; CHECK-GI-NEXT: add w8, w10, w8 -; CHECK-GI-NEXT: fmov w10, s4 ; CHECK-GI-NEXT: add v0.4s, v18.4s, v7.4s ; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: add w8, w9, w8 diff --git a/llvm/test/CodeGen/AArch64/vselect-ext.ll b/llvm/test/CodeGen/AArch64/vselect-ext.ll index 0b90343a40c83..5414bacf42a92 100644 --- a/llvm/test/CodeGen/AArch64/vselect-ext.ll +++ b/llvm/test/CodeGen/AArch64/vselect-ext.ll @@ -8,15 +8,14 @@ define <16 x i32> @no_existing_zext(<16 x i8> %a, <16 x i32> %op) { ; CHECK-NEXT: cmhi.16b v0, v0, v5 ; CHECK-NEXT: sshll.8h v5, v0, #0 ; CHECK-NEXT: sshll2.8h v0, v0, #0 -; CHECK-NEXT: sshll2.4s v16, v0, #0 ; CHECK-NEXT: sshll.4s v6, v5, #0 ; CHECK-NEXT: sshll.4s v7, v0, #0 ; CHECK-NEXT: sshll2.4s v5, v5, #0 -; CHECK-NEXT: and.16b v4, v4, v16 +; 
CHECK-NEXT: sshll2.4s v16, v0, #0 ; CHECK-NEXT: and.16b v0, v1, v6 ; CHECK-NEXT: and.16b v1, v2, v5 ; CHECK-NEXT: and.16b v2, v3, v7 -; CHECK-NEXT: mov.16b v3, v4 +; CHECK-NEXT: and.16b v3, v4, v16 ; CHECK-NEXT: ret entry: %cmp = icmp ugt <16 x i8> %a, @@ -333,8 +332,8 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_other_use(<16 ; CHECK-LABEL: same_zext_used_in_cmp_unsigned_pred_and_select_other_use: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.16b v16, #10 -; CHECK-NEXT: ushll.8h v19, v0, #0 ; CHECK-NEXT: ldr q21, [sp] +; CHECK-NEXT: ushll.8h v19, v0, #0 ; CHECK-NEXT: ushll.4s v24, v19, #0 ; CHECK-NEXT: ushll2.4s v19, v19, #0 ; CHECK-NEXT: cmhi.16b v16, v0, v16 @@ -352,8 +351,8 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_other_use(<16 ; CHECK-NEXT: sshll2.2d v26, v17, #0 ; CHECK-NEXT: sshll.2d v27, v17, #0 ; CHECK-NEXT: and.16b v20, v21, v20 -; CHECK-NEXT: sshll2.2d v21, v22, #0 ; CHECK-NEXT: and.16b v7, v7, v23 +; CHECK-NEXT: sshll2.2d v21, v22, #0 ; CHECK-NEXT: sshll.2d v23, v22, #0 ; CHECK-NEXT: and.16b v6, v6, v26 ; CHECK-NEXT: sshll2.2d v26, v16, #0 @@ -361,16 +360,14 @@ define <16 x i32> @same_zext_used_in_cmp_unsigned_pred_and_select_other_use(<16 ; CHECK-NEXT: stp q7, q20, [x0, #96] ; CHECK-NEXT: sshll.2d v20, v16, #0 ; CHECK-NEXT: and.16b v21, v4, v21 -; CHECK-NEXT: and.16b v4, v0, v18 ; CHECK-NEXT: and.16b v7, v3, v23 -; CHECK-NEXT: and.16b v3, v19, v22 -; CHECK-NEXT: stp q5, q6, [x0, #64] +; CHECK-NEXT: and.16b v3, v0, v18 ; CHECK-NEXT: and.16b v0, v24, v16 +; CHECK-NEXT: stp q5, q6, [x0, #64] ; CHECK-NEXT: and.16b v6, v2, v26 ; CHECK-NEXT: and.16b v2, v25, v17 ; CHECK-NEXT: and.16b v5, v1, v20 -; CHECK-NEXT: mov.16b v1, v3 -; CHECK-NEXT: mov.16b v3, v4 +; CHECK-NEXT: and.16b v1, v19, v22 ; CHECK-NEXT: stp q7, q21, [x0, #32] ; CHECK-NEXT: stp q5, q6, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 66bb131ce7249..38ad96df79cf6 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2848,21 +2848,21 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1] ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] -; CHECK-BE-NEXT: add x8, x0, #16 -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] ; CHECK-BE-NEXT: add x9, x0, #48 ; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: add x1, x1, #16 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] +; CHECK-BE-NEXT: add x8, x0, #16 ; CHECK-BE-NEXT: umull v4.4s, v1.4h, v2.4h -; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h -; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h ; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h +; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h +; CHECK-BE-NEXT: st1 { v5.4s }, [x9] ; CHECK-BE-NEXT: st1 { v4.4s }, [x0] ; CHECK-BE-NEXT: mov x0, x8 -; CHECK-BE-NEXT: st1 { v5.4s }, [x9] ; CHECK-BE-NEXT: st1 { v0.4s }, [x10] ; CHECK-BE-NEXT: st1 { v1.4s }, [x8] ; CHECK-BE-NEXT: b.ne .LBB24_1 @@ -2980,16 +2980,16 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: add x10, x0, #16 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: rev32 
v21.8b, v7.8b ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b ; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8 ; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8 ; CHECK-BE-NEXT: rev32 v6.8b, v6.8b ; CHECK-BE-NEXT: rev32 v17.8b, v17.8b -; CHECK-BE-NEXT: rev32 v19.8b, v19.8b ; CHECK-BE-NEXT: umull v5.2d, v5.2s, v18.2s +; CHECK-BE-NEXT: rev32 v19.8b, v19.8b ; CHECK-BE-NEXT: umull v18.2d, v21.2s, v22.2s ; CHECK-BE-NEXT: ext v21.16b, v22.16b, v22.16b, #8 ; CHECK-BE-NEXT: rev32 v7.8b, v7.8b @@ -2997,9 +2997,9 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: ext v16.16b, v16.16b, v16.16b, #8 ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s -; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s ; CHECK-BE-NEXT: st1 { v5.2d }, [x8] ; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s +; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s ; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s ; CHECK-BE-NEXT: add x8, x0, #112 ; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s @@ -3007,11 +3007,11 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: add x9, x0, #80 ; CHECK-BE-NEXT: st1 { v22.2d }, [x0] ; CHECK-BE-NEXT: st1 { v17.2d }, [x8] +; CHECK-BE-NEXT: st1 { v5.2d }, [x8] ; CHECK-BE-NEXT: add x8, x0, #64 ; CHECK-BE-NEXT: st1 { v19.2d }, [x9] ; CHECK-BE-NEXT: add x9, x0, #48 ; CHECK-BE-NEXT: mov x0, x8 -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] ; CHECK-BE-NEXT: st1 { v4.2d }, [x10] ; CHECK-BE-NEXT: b.ne .LBB25_1 diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index bb968c8eb00fc..5cd7fd15735f3 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -354,9 +354,9 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-GI-LABEL: zext_v3i16_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] ; CHECK-GI-NEXT: umov w10, v0.h[2] +; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 @@ -379,9 +379,9 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) { ; ; CHECK-GI-LABEL: zext_v3i32_v3i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: mov w10, v0.s[2] +; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 @@ -564,18 +564,11 @@ entry: } define <4 x i64> @zext_v4i32_v4i64(<4 x i32> %a) { -; CHECK-SD-LABEL: zext_v4i32_v4i64: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll2 v1.2d, v0.4s, #0 -; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: zext_v4i32_v4i64: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-GI-NEXT: ushll2 v1.2d, v0.4s, #0 -; CHECK-GI-NEXT: mov v0.16b, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: zext_v4i32_v4i64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret entry: %c = zext <4 x i32> %a to <4 x i64> ret <4 x i64> %c @@ -696,18 +689,11 @@ entry: } define <8 x i32> @zext_v8i16_v8i32(<8 x i16> %a) { -; CHECK-SD-LABEL: zext_v8i16_v8i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: zext_v8i16_v8i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.4s, 
v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov v0.16b, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: zext_v8i16_v8i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ret entry: %c = zext <8 x i16> %a to <8 x i32> ret <8 x i32> %c @@ -741,21 +727,19 @@ entry: define <8 x i64> @zext_v8i32_v8i64(<8 x i32> %a) { ; CHECK-SD-LABEL: zext_v8i32_v8i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll v5.2d, v0.2s, #0 ; CHECK-SD-NEXT: ushll2 v4.2d, v0.4s, #0 ; CHECK-SD-NEXT: ushll2 v3.2d, v1.4s, #0 +; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: ushll v2.2d, v1.2s, #0 -; CHECK-SD-NEXT: mov v0.16b, v5.16b ; CHECK-SD-NEXT: mov v1.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: zext_v8i32_v8i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v4.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll2 v5.2d, v0.4s, #0 ; CHECK-GI-NEXT: ushll v2.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll2 v3.2d, v1.4s, #0 -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret entry: @@ -833,18 +817,11 @@ entry: } define <16 x i16> @zext_v16i8_v16i16(<16 x i8> %a) { -; CHECK-SD-LABEL: zext_v16i8_v16i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll2 v1.8h, v0.16b, #0 -; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: zext_v16i8_v16i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: ushll2 v1.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov v0.16b, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: zext_v16i8_v16i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ret entry: %c = zext <16 x i8> %a to <16 x i16> ret <16 x i16> %c @@ -919,21 +896,19 @@ entry: define <16 x i32> @zext_v16i16_v16i32(<16 x i16> %a) { ; CHECK-SD-LABEL: zext_v16i16_v16i32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll v5.4s, v0.4h, #0 ; CHECK-SD-NEXT: ushll2 v4.4s, v0.8h, #0 ; CHECK-SD-NEXT: ushll2 v3.4s, v1.8h, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-SD-NEXT: mov v0.16b, v5.16b ; CHECK-SD-NEXT: mov v1.16b, v4.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: zext_v16i16_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 ; CHECK-GI-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-GI-NEXT: mov v0.16b, v4.16b ; CHECK-GI-NEXT: mov v1.16b, v5.16b ; CHECK-GI-NEXT: ret entry: @@ -981,30 +956,28 @@ entry: define <16 x i64> @zext_v16i32_v16i64(<16 x i32> %a) { ; CHECK-SD-LABEL: zext_v16i32_v16i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll2 v17.2d, v0.4s, #0 -; CHECK-SD-NEXT: ushll2 v16.2d, v1.4s, #0 ; CHECK-SD-NEXT: ushll v18.2d, v1.2s, #0 +; CHECK-SD-NEXT: ushll2 v16.2d, v1.4s, #0 ; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-SD-NEXT: ushll2 v7.2d, v3.4s, #0 ; CHECK-SD-NEXT: ushll v4.2d, v2.2s, #0 ; CHECK-SD-NEXT: ushll2 v5.2d, v2.4s, #0 -; CHECK-SD-NEXT: ushll2 v7.2d, v3.4s, #0 ; CHECK-SD-NEXT: ushll v6.2d, v3.2s, #0 -; CHECK-SD-NEXT: mov v1.16b, v17.16b +; CHECK-SD-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-SD-NEXT: mov v2.16b, v18.16b ; CHECK-SD-NEXT: mov v3.16b, v16.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: zext_v16i32_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v16.2d, v0.2s, #0 +; 
CHECK-GI-NEXT: ushll2 v19.2d, v1.4s, #0 ; CHECK-GI-NEXT: ushll2 v17.2d, v0.4s, #0 ; CHECK-GI-NEXT: ushll v18.2d, v1.2s, #0 -; CHECK-GI-NEXT: ushll2 v19.2d, v1.4s, #0 +; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll v4.2d, v2.2s, #0 -; CHECK-GI-NEXT: ushll2 v5.2d, v2.4s, #0 ; CHECK-GI-NEXT: ushll v6.2d, v3.2s, #0 ; CHECK-GI-NEXT: ushll2 v7.2d, v3.4s, #0 -; CHECK-GI-NEXT: mov v0.16b, v16.16b +; CHECK-GI-NEXT: ushll2 v5.2d, v2.4s, #0 ; CHECK-GI-NEXT: mov v1.16b, v17.16b ; CHECK-GI-NEXT: mov v2.16b, v18.16b ; CHECK-GI-NEXT: mov v3.16b, v19.16b diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll index 5ebb115791c66..a14e943584f5f 100644 --- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll +++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll @@ -97,8 +97,7 @@ define <2 x i1> @saddo(ptr %ptr, ptr %ptr2) { ; CHECK-NEXT: vand q9, q9, q10 ; CHECK-NEXT: vmvn q9, q9 ; CHECK-NEXT: vmovn.i64 d18, q9 -; CHECK-NEXT: vmov r2, r1, d18 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d18 ; CHECK-NEXT: bx lr %x = load <2 x i64>, ptr %ptr, align 8 %y = load <2 x i64>, ptr %ptr2, align 8 @@ -122,8 +121,7 @@ define <2 x i1> @ssubo(ptr %ptr, ptr %ptr2) { ; CHECK-NEXT: vand q9, q9, q10 ; CHECK-NEXT: vmvn q9, q9 ; CHECK-NEXT: vmovn.i64 d18, q9 -; CHECK-NEXT: vmov r2, r1, d18 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d18 ; CHECK-NEXT: bx lr %x = load <2 x i64>, ptr %ptr, align 8 %y = load <2 x i64>, ptr %ptr2, align 8 diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll index 78090083a0026..b8ea7c10ad2f4 100644 --- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll @@ -1268,7 +1268,6 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 -; CHECK-NEON-NEXT: vmov.f32 s20, s5 ; CHECK-NEON-NEXT: vmov.f32 s22, s4 ; CHECK-NEON-NEXT: vmov.f32 s24, s3 ; CHECK-NEON-NEXT: vmov.f32 s26, s2 @@ -1301,12 +1300,12 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s28 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov r1, s20 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vmov r0, s0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 +; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 @@ -1514,7 +1513,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 -; CHECK-NEON-NEXT: vmov.f32 s20, s5 ; CHECK-NEON-NEXT: vmov.f32 s22, s4 ; CHECK-NEON-NEXT: vmov.f32 s24, s3 ; CHECK-NEON-NEXT: vmov.f32 s26, s2 @@ -1547,12 +1545,12 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s28 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov r1, s20 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vmov r0, s0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 +; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; 
CHECK-NEON-NEXT: vmov.32 d12[0], r0 @@ -2459,24 +2457,23 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-NEXT: vorr q4, q0, q0 ; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_d2lz -; CHECK-NEXT: vmov r2, r12, d9 ; CHECK-NEXT: mvn r5, #0 ; CHECK-NEXT: subs r3, r0, r5 -; CHECK-NEXT: mov r6, #0 ; CHECK-NEXT: sbcs r3, r1, #0 -; CHECK-NEXT: mov r4, #0 +; CHECK-NEXT: mov r6, #0 ; CHECK-NEXT: mov r3, #0 +; CHECK-NEXT: mov r4, #0 ; CHECK-NEXT: movwlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: movne r3, r1 ; CHECK-NEXT: moveq r0, r5 ; CHECK-NEXT: rsbs r1, r0, #0 ; CHECK-NEXT: rscs r1, r3, #0 +; CHECK-NEXT: vmov r2, r1, d9 ; CHECK-NEXT: movwlt r6, #1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: movne r6, r0 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r12 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: subs r2, r0, r5 ; CHECK-NEXT: vmov.32 d0[0], r6 @@ -2640,9 +2637,9 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-NEXT: subs r3, r0, r7 ; CHECK-NEXT: mov r4, #0 ; CHECK-NEXT: sbcs r3, r1, #0 -; CHECK-NEXT: mov r10, #0 -; CHECK-NEXT: mov r3, #0 ; CHECK-NEXT: vmov r9, s18 +; CHECK-NEXT: mov r3, #0 +; CHECK-NEXT: mov r10, #0 ; CHECK-NEXT: movwlt r3, #1 ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: movne r3, r1 @@ -2979,9 +2976,9 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NEON-NEXT: subs r3, r0, r7 ; CHECK-NEON-NEXT: mov r4, #0 ; CHECK-NEON-NEXT: sbcs r3, r1, #0 -; CHECK-NEON-NEXT: mov r10, #0 -; CHECK-NEON-NEXT: mov r3, #0 ; CHECK-NEON-NEXT: vmov r8, s18 +; CHECK-NEON-NEXT: mov r3, #0 +; CHECK-NEON-NEXT: mov r10, #0 ; CHECK-NEON-NEXT: movwlt r3, #1 ; CHECK-NEON-NEXT: cmp r3, #0 ; CHECK-NEON-NEXT: movne r3, r1 @@ -3260,7 +3257,6 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 -; CHECK-NEON-NEXT: vmov.f32 s20, s5 ; CHECK-NEON-NEXT: vmov.f32 s22, s4 ; CHECK-NEON-NEXT: vmov.f32 s24, s3 ; CHECK-NEON-NEXT: vmov.f32 s26, s2 @@ -3293,12 +3289,12 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s28 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov r1, s20 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vmov r0, s0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 +; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 @@ -3503,7 +3499,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 -; CHECK-NEON-NEXT: vmov.f32 s20, s5 ; CHECK-NEON-NEXT: vmov.f32 s22, s4 ; CHECK-NEON-NEXT: vmov.f32 s24, s3 ; CHECK-NEON-NEXT: vmov.f32 s26, s2 @@ -3536,12 +3531,12 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s28 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 -; CHECK-NEON-NEXT: vmov r1, s20 -; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vmov r0, s0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 +; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 +; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 diff --git 
a/llvm/test/CodeGen/ARM/funnel-shift.ll b/llvm/test/CodeGen/ARM/funnel-shift.ll index 5a7c4384428e1..befbb90a1527b 100644 --- a/llvm/test/CodeGen/ARM/funnel-shift.ll +++ b/llvm/test/CodeGen/ARM/funnel-shift.ll @@ -374,11 +374,10 @@ define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) { define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) { ; CHECK-LABEL: fshr_i64_const_overshift: ; CHECK: @ %bb.0: -; CHECK-NEXT: lsl r2, r0, #23 ; CHECK-NEXT: lsl r1, r1, #23 -; CHECK-NEXT: orr r2, r2, r3, lsr #9 +; CHECK-NEXT: lsl r2, r0, #23 ; CHECK-NEXT: orr r1, r1, r0, lsr #9 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: orr r0, r2, r3, lsr #9 ; CHECK-NEXT: bx lr %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105) ret i64 %f diff --git a/llvm/test/CodeGen/ARM/llvm.exp10.ll b/llvm/test/CodeGen/ARM/llvm.exp10.ll index eb72fe8c1e1b7..0e057c48cb5f1 100644 --- a/llvm/test/CodeGen/ARM/llvm.exp10.ll +++ b/llvm/test/CodeGen/ARM/llvm.exp10.ll @@ -279,16 +279,12 @@ define <3 x double> @exp10_v3f64(<3 x double> %x) { ; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl exp10 -; CHECK-NEXT: ldrd r2, r3, [sp, #24] ; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: mov r1, r3 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: ldrd r0, r1, [sp, #24] ; CHECK-NEXT: bl exp10 -; CHECK-NEXT: ldrd r2, r3, [sp, #32] ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: ldrd r0, r1, [sp, #32] ; CHECK-NEXT: vst1.64 {d8, d9}, [r4:128]! -; CHECK-NEXT: mov r0, r2 ; CHECK-NEXT: bl exp10 ; CHECK-NEXT: strd r0, r1, [r4] ; CHECK-NEXT: vpop {d8, d9} @@ -309,10 +305,8 @@ define <4 x double> @exp10_v4f64(<4 x double> %x) { ; CHECK-NEXT: add r2, sp, #64 ; CHECK-NEXT: vmov d8, r0, r1 ; CHECK-NEXT: vld1.64 {d16, d17}, [r2] -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: vmov r0, r1, d17 ; CHECK-NEXT: vmov r5, r8, d16 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r3 ; CHECK-NEXT: bl exp10 ; CHECK-NEXT: mov r7, r0 ; CHECK-NEXT: mov r6, r1 diff --git a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll index 4b6d14efd0ecb..ff8ed66faca4d 100644 --- a/llvm/test/CodeGen/ARM/load-combine-big-endian.ll +++ b/llvm/test/CodeGen/ARM/load-combine-big-endian.ll @@ -308,9 +308,7 @@ define i64 @load_i64_by_i8_bswap(ptr %arg) { define i64 @load_i64_by_i8(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: ldr r2, [r0] -; CHECK-NEXT: ldr r1, [r0, #4] -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: ldm r0, {r0, r1} ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8: @@ -320,16 +318,12 @@ define i64 @load_i64_by_i8(ptr %arg) { ; ; CHECK-THUMBv6-LABEL: load_i64_by_i8: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldr r2, [r0] -; CHECK-THUMBv6-NEXT: ldr r1, [r0, #4] -; CHECK-THUMBv6-NEXT: mov r0, r2 +; CHECK-THUMBv6-NEXT: ldm r0, {r0, r1} ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: load_i64_by_i8: ; CHECK-THUMBv7: @ %bb.0: -; CHECK-THUMBv7-NEXT: ldr r2, [r0] -; CHECK-THUMBv7-NEXT: ldr r1, [r0, #4] -; CHECK-THUMBv7-NEXT: mov r0, r2 +; CHECK-THUMBv7-NEXT: ldm r0, {r0, r1} ; CHECK-THUMBv7-NEXT: bx lr %tmp1 = load i8, ptr %arg, align 8 diff --git a/llvm/test/CodeGen/ARM/load-combine.ll b/llvm/test/CodeGen/ARM/load-combine.ll index 0f6ec8aa47386..b0fa25610fc15 100644 --- a/llvm/test/CodeGen/ARM/load-combine.ll +++ b/llvm/test/CodeGen/ARM/load-combine.ll @@ -170,9 +170,7 @@ define i32 @load_i32_by_i8_bswap(ptr %arg) { define i64 @load_i64_by_i8(ptr %arg) { ; CHECK-LABEL: load_i64_by_i8: ; CHECK: @ %bb.0: -; CHECK-NEXT: ldr r2, [r0] -; CHECK-NEXT: ldr r1, [r0, #4] 
-; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: ldm r0, {r0, r1} ; CHECK-NEXT: mov pc, lr ; ; CHECK-ARMv6-LABEL: load_i64_by_i8: @@ -182,9 +180,7 @@ define i64 @load_i64_by_i8(ptr %arg) { ; ; CHECK-THUMBv6-LABEL: load_i64_by_i8: ; CHECK-THUMBv6: @ %bb.0: -; CHECK-THUMBv6-NEXT: ldr r2, [r0] -; CHECK-THUMBv6-NEXT: ldr r1, [r0, #4] -; CHECK-THUMBv6-NEXT: mov r0, r2 +; CHECK-THUMBv6-NEXT: ldm r0, {r0, r1} ; CHECK-THUMBv6-NEXT: bx lr ; ; CHECK-THUMBv7-LABEL: load_i64_by_i8: diff --git a/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll b/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll index 046bbbde68642..b9ca841aa317c 100644 --- a/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll +++ b/llvm/test/CodeGen/ARM/sub-cmp-peephole.ll @@ -186,9 +186,7 @@ define double @double_sub(i32 %a, i32 %b, double %x, double %y) { ; CHECK-V7-NEXT: movw r1, :lower16:t ; CHECK-V7-NEXT: movt r1, :upper16:t ; CHECK-V7-NEXT: str r0, [r1] -; CHECK-V7-NEXT: vmov r2, r3, d16 -; CHECK-V7-NEXT: mov r0, r2 -; CHECK-V7-NEXT: mov r1, r3 +; CHECK-V7-NEXT: vmov r0, r1, d16 ; CHECK-V7-NEXT: bx lr ; ; CHECK-V8-LABEL: double_sub: @@ -197,13 +195,11 @@ define double @double_sub(i32 %a, i32 %b, double %x, double %y) { ; CHECK-V8-NEXT: cmp r0, r1 ; CHECK-V8-NEXT: vmov d17, r2, r3 ; CHECK-V8-NEXT: sub r0, r0, r1 -; CHECK-V8-NEXT: vselgt.f64 d16, d17, d16 ; CHECK-V8-NEXT: movw r1, :lower16:t -; CHECK-V8-NEXT: vmov r2, r3, d16 +; CHECK-V8-NEXT: vselgt.f64 d16, d17, d16 ; CHECK-V8-NEXT: movt r1, :upper16:t ; CHECK-V8-NEXT: str r0, [r1] -; CHECK-V8-NEXT: mov r0, r2 -; CHECK-V8-NEXT: mov r1, r3 +; CHECK-V8-NEXT: vmov r0, r1, d16 ; CHECK-V8-NEXT: bx lr entry: %cmp = icmp sgt i32 %a, %b @@ -224,9 +220,7 @@ define double @double_sub_swap(i32 %a, i32 %b, double %x, double %y) { ; CHECK-V7-NEXT: movw r1, :lower16:t ; CHECK-V7-NEXT: movt r1, :upper16:t ; CHECK-V7-NEXT: str r0, [r1] -; CHECK-V7-NEXT: vmov r2, r3, d16 -; CHECK-V7-NEXT: mov r0, r2 -; CHECK-V7-NEXT: mov r1, r3 +; CHECK-V7-NEXT: vmov r0, r1, d16 ; CHECK-V7-NEXT: bx lr ; ; CHECK-V8-LABEL: double_sub_swap: @@ -235,13 +229,11 @@ define double @double_sub_swap(i32 %a, i32 %b, double %x, double %y) { ; CHECK-V8-NEXT: cmp r1, r0 ; CHECK-V8-NEXT: vmov d17, r2, r3 ; CHECK-V8-NEXT: sub r0, r1, r0 -; CHECK-V8-NEXT: vselge.f64 d16, d16, d17 ; CHECK-V8-NEXT: movw r1, :lower16:t -; CHECK-V8-NEXT: vmov r2, r3, d16 +; CHECK-V8-NEXT: vselge.f64 d16, d16, d17 ; CHECK-V8-NEXT: movt r1, :upper16:t ; CHECK-V8-NEXT: str r0, [r1] -; CHECK-V8-NEXT: mov r0, r2 -; CHECK-V8-NEXT: mov r1, r3 +; CHECK-V8-NEXT: vmov r0, r1, d16 ; CHECK-V8-NEXT: bx lr entry: %cmp = icmp sgt i32 %a, %b diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll index fe81324d6679b..c40dd2e922963 100644 --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll @@ -96,15 +96,14 @@ define fp128 @test_v1f128(<1 x fp128> %a, fp128 %s) nounwind { ; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #32] -; CHECK-NEXT: ldr lr, [sp, #36] -; CHECK-NEXT: ldr r4, [sp, #40] -; CHECK-NEXT: ldr r5, [sp, #44] -; CHECK-NEXT: stm sp, {r0, r1, r2, r3} -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: mov r1, lr -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: str r1, [sp, #4] +; CHECK-NEXT: str r2, [sp, #8] +; CHECK-NEXT: str r3, [sp, #12] +; CHECK-NEXT: ldr r0, [sp, #32] +; CHECK-NEXT: ldr r1, [sp, #36] +; CHECK-NEXT: ldr r2, 
[sp, #40] +; CHECK-NEXT: ldr r3, [sp, #44] ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: pop {r4, r5, r11, lr} @@ -194,15 +193,14 @@ define fp128 @test_v2f128(<2 x fp128> %a, fp128 %s) nounwind { ; CHECK-NEXT: push {r4, r5, r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: ldr r12, [sp, #48] -; CHECK-NEXT: ldr lr, [sp, #52] -; CHECK-NEXT: ldr r4, [sp, #56] -; CHECK-NEXT: ldr r5, [sp, #60] -; CHECK-NEXT: stm sp, {r0, r1, r2, r3} -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: mov r1, lr -; CHECK-NEXT: mov r2, r4 -; CHECK-NEXT: mov r3, r5 +; CHECK-NEXT: str r0, [sp] +; CHECK-NEXT: str r1, [sp, #4] +; CHECK-NEXT: str r2, [sp, #8] +; CHECK-NEXT: str r3, [sp, #12] +; CHECK-NEXT: ldr r0, [sp, #48] +; CHECK-NEXT: ldr r1, [sp, #52] +; CHECK-NEXT: ldr r2, [sp, #56] +; CHECK-NEXT: ldr r3, [sp, #60] ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: ldr r4, [sp, #32] ; CHECK-NEXT: ldr r5, [sp, #40] diff --git a/llvm/test/CodeGen/ARM/vlddup.ll b/llvm/test/CodeGen/ARM/vlddup.ll index c43cb623f585c..659b2747cd6f6 100644 --- a/llvm/test/CodeGen/ARM/vlddup.ll +++ b/llvm/test/CodeGen/ARM/vlddup.ll @@ -21,8 +21,7 @@ define <8 x i8> @vld1dupi8_preinc(ptr noalias nocapture %a, i32 %b) nounwind { ; CHECK-NEXT: add r3, r2, r1 ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: vld1.8 {d16[]}, [r3] -; CHECK-NEXT: vmov r2, r1, d16 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr entry: %0 = load ptr, ptr %a, align 4 @@ -40,8 +39,7 @@ define <8 x i8> @vld1dupi8_postinc_fixed(ptr noalias nocapture %a) nounwind { ; CHECK-NEXT: ldr r3, [r0] ; CHECK-NEXT: vld1.8 {d16[]}, [r3]! ; CHECK-NEXT: str r3, [r0] -; CHECK-NEXT: vmov r2, r1, d16 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr entry: %0 = load ptr, ptr %a, align 4 @@ -59,8 +57,7 @@ define <8 x i8> @vld1dupi8_postinc_register(ptr noalias nocapture %a, i32 %n) no ; CHECK-NEXT: ldr r3, [r0] ; CHECK-NEXT: vld1.8 {d16[]}, [r3], r1 ; CHECK-NEXT: str r3, [r0] -; CHECK-NEXT: vmov r2, r1, d16 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr entry: %0 = load ptr, ptr %a, align 4 @@ -81,9 +78,8 @@ define <16 x i8> @vld1dupqi8_preinc(ptr noalias nocapture %a, i32 %b) nounwind { ; CHECK-NEXT: add lr, r2, r1 ; CHECK-NEXT: str lr, [r0] ; CHECK-NEXT: vld1.8 {d16[], d17[]}, [lr] -; CHECK-NEXT: vmov r12, r1, d16 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr entry: @@ -104,9 +100,8 @@ define <16 x i8> @vld1dupqi8_postinc_fixed(ptr noalias nocapture %a) nounwind { ; CHECK-NEXT: ldr lr, [r0] ; CHECK-NEXT: vld1.8 {d16[], d17[]}, [lr]! 
; CHECK-NEXT: str lr, [r0] -; CHECK-NEXT: vmov r12, r1, d16 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr entry: @@ -127,9 +122,8 @@ define <16 x i8> @vld1dupqi8_postinc_register(ptr noalias nocapture %a, i32 %n) ; CHECK-NEXT: ldr lr, [r0] ; CHECK-NEXT: vld1.8 {d16[], d17[]}, [lr], r1 ; CHECK-NEXT: str lr, [r0] -; CHECK-NEXT: vmov r12, r1, d16 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: vmov r2, r3, d17 -; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r11, lr} ; CHECK-NEXT: mov pc, lr entry: @@ -420,8 +414,7 @@ define <4 x i16> @vld2dupi16_update(ptr %ptr) nounwind { ; CHECK-NEXT: vadd.i16 d16, d16, d17 ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: vdup.16 d16, d16[0] -; CHECK-NEXT: vmov r2, r1, d16 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %A = load ptr, ptr %ptr %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) @@ -444,8 +437,7 @@ define <4 x i16> @vld2dupi16_odd_update(ptr %ptr) nounwind { ; CHECK-NEXT: vadd.i16 d16, d16, d17 ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: vdup.16 d16, d16[0] -; CHECK-NEXT: vmov r2, r1, d16 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %A = load ptr, ptr %ptr %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2) @@ -494,8 +486,7 @@ define <8 x i8> @vld3dupi8_update(ptr %ptr, i32 %inc) nounwind { ; CHECK-NEXT: vadd.i8 d16, d20, d18 ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: vdup.8 d16, d16[0] -; CHECK-NEXT: vmov r2, r1, d16 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %A = load ptr, ptr %ptr %tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8) @@ -551,8 +542,7 @@ define <4 x i16> @vld4dupi16_update(ptr %ptr) nounwind { ; CHECK-NEXT: str r3, [r0] ; CHECK-NEXT: vadd.i16 d16, d16, d20 ; CHECK-NEXT: vdup.16 d16, d16[0] -; CHECK-NEXT: vmov r2, r1, d16 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d16 ; CHECK-NEXT: mov pc, lr %A = load ptr, ptr %ptr %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1) diff --git a/llvm/test/CodeGen/ARM/vldlane.ll b/llvm/test/CodeGen/ARM/vldlane.ll index f6ddc9988877e..b7d3e5ed8e9d7 100644 --- a/llvm/test/CodeGen/ARM/vldlane.ll +++ b/llvm/test/CodeGen/ARM/vldlane.ll @@ -199,8 +199,7 @@ define <2 x i32> @vld2lanei32_update(ptr %ptr, ptr %B) nounwind { ; DEFAULT-NEXT: vld2.32 {d16[1], d17[1]}, [r3]! ; DEFAULT-NEXT: vadd.i32 d16, d16, d17 ; DEFAULT-NEXT: str r3, [r0] -; DEFAULT-NEXT: vmov r2, r1, d16 -; DEFAULT-NEXT: mov r0, r2 +; DEFAULT-NEXT: vmov r0, r1, d16 ; DEFAULT-NEXT: mov pc, lr ; ; BASIC-LABEL: vld2lanei32_update: @@ -213,9 +212,7 @@ define <2 x i32> @vld2lanei32_update(ptr %ptr, ptr %B) nounwind { ; BASIC-NEXT: vld2.32 {d16[1], d17[1]}, [r0]! 
; BASIC-NEXT: vadd.i32 d16, d16, d17 ; BASIC-NEXT: str r0, [r1] -; BASIC-NEXT: vmov r2, r3, d16 -; BASIC-NEXT: mov r0, r2 -; BASIC-NEXT: mov r1, r3 +; BASIC-NEXT: vmov r0, r1, d16 ; BASIC-NEXT: mov pc, lr %A = load ptr, ptr %ptr %tmp1 = load <2 x i32>, ptr %B @@ -238,8 +235,7 @@ define <2 x i32> @vld2lanei32_odd_update(ptr %ptr, ptr %B) nounwind { ; DEFAULT-NEXT: vld2.32 {d16[1], d17[1]}, [r3], r1 ; DEFAULT-NEXT: vadd.i32 d16, d16, d17 ; DEFAULT-NEXT: str r3, [r0] -; DEFAULT-NEXT: vmov r2, r1, d16 -; DEFAULT-NEXT: mov r0, r2 +; DEFAULT-NEXT: vmov r0, r1, d16 ; DEFAULT-NEXT: mov pc, lr ; ; BASIC-LABEL: vld2lanei32_odd_update: @@ -253,9 +249,7 @@ define <2 x i32> @vld2lanei32_odd_update(ptr %ptr, ptr %B) nounwind { ; BASIC-NEXT: vld2.32 {d16[1], d17[1]}, [r0], r2 ; BASIC-NEXT: vadd.i32 d16, d16, d17 ; BASIC-NEXT: str r0, [r1] -; BASIC-NEXT: vmov r2, r3, d16 -; BASIC-NEXT: mov r0, r2 -; BASIC-NEXT: mov r1, r3 +; BASIC-NEXT: vmov r0, r1, d16 ; BASIC-NEXT: mov pc, lr %A = load ptr, ptr %ptr %tmp1 = load <2 x i32>, ptr %B @@ -538,9 +532,8 @@ define <8 x i16> @vld3laneQi16_update(ptr %ptr, ptr %B, i32 %inc) nounwind { ; DEFAULT-NEXT: vadd.i16 q12, q8, q9 ; DEFAULT-NEXT: vadd.i16 q8, q10, q12 ; DEFAULT-NEXT: str lr, [r0] -; DEFAULT-NEXT: vmov r12, r1, d16 +; DEFAULT-NEXT: vmov r0, r1, d16 ; DEFAULT-NEXT: vmov r2, r3, d17 -; DEFAULT-NEXT: mov r0, r12 ; DEFAULT-NEXT: pop {r11, lr} ; DEFAULT-NEXT: mov pc, lr ; @@ -558,11 +551,8 @@ define <8 x i16> @vld3laneQi16_update(ptr %ptr, ptr %B, i32 %inc) nounwind { ; BASIC-NEXT: vadd.i16 q8, q9, q10 ; BASIC-NEXT: vadd.i16 q8, q11, q8 ; BASIC-NEXT: str r0, [r3] -; BASIC-NEXT: vmov r1, lr, d16 -; BASIC-NEXT: vmov r2, r12, d17 -; BASIC-NEXT: mov r0, r1 -; BASIC-NEXT: mov r1, lr -; BASIC-NEXT: mov r3, r12 +; BASIC-NEXT: vmov r0, r1, d16 +; BASIC-NEXT: vmov r2, r3, d17 ; BASIC-NEXT: pop {r11, lr} ; BASIC-NEXT: mov pc, lr %A = load ptr, ptr %ptr @@ -704,8 +694,7 @@ define <8 x i8> @vld4lanei8_update(ptr %ptr, ptr %B) nounwind { ; DEFAULT-NEXT: vadd.i8 d20, d18, d19 ; DEFAULT-NEXT: str r3, [r0] ; DEFAULT-NEXT: vadd.i8 d16, d16, d20 -; DEFAULT-NEXT: vmov r2, r1, d16 -; DEFAULT-NEXT: mov r0, r2 +; DEFAULT-NEXT: vmov r0, r1, d16 ; DEFAULT-NEXT: mov pc, lr ; ; BASIC-LABEL: vld4lanei8_update: @@ -721,9 +710,7 @@ define <8 x i8> @vld4lanei8_update(ptr %ptr, ptr %B) nounwind { ; BASIC-NEXT: vadd.i8 d20, d18, d19 ; BASIC-NEXT: str r0, [r3] ; BASIC-NEXT: vadd.i8 d16, d16, d20 -; BASIC-NEXT: vmov r1, r2, d16 -; BASIC-NEXT: mov r0, r1 -; BASIC-NEXT: mov r1, r2 +; BASIC-NEXT: vmov r0, r1, d16 ; BASIC-NEXT: mov pc, lr %A = load ptr, ptr %ptr %tmp1 = load <8 x i8>, ptr %B diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll index f032756e007b6..51589f216a362 100644 --- a/llvm/test/CodeGen/RISCV/alu64.ll +++ b/llvm/test/CodeGen/RISCV/alu64.ll @@ -326,9 +326,8 @@ define i64 @sra(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: sra a1, a1, a2 ; RV32I-NEXT: bltz a4, .LBB16_2 ; RV32I-NEXT: # %bb.1: -; RV32I-NEXT: srai a3, a3, 31 ; RV32I-NEXT: mv a0, a1 -; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: srai a1, a3, 31 ; RV32I-NEXT: ret ; RV32I-NEXT: .LBB16_2: ; RV32I-NEXT: srl a0, a0, a2 @@ -437,9 +436,8 @@ define i64 @sraiw_i64(i64 %a) nounwind { ; ; RV32I-LABEL: sraiw_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: srai a2, a0, 9 ; RV32I-NEXT: srai a1, a0, 31 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: srai a0, a0, 9 ; RV32I-NEXT: ret %1 = shl i64 %a, 32 %2 = ashr i64 %1, 41 diff --git a/llvm/test/CodeGen/RISCV/branch-on-zero.ll b/llvm/test/CodeGen/RISCV/branch-on-zero.ll index 
02aeebdeb3775..b77809308ad9c 100644 --- a/llvm/test/CodeGen/RISCV/branch-on-zero.ll +++ b/llvm/test/CodeGen/RISCV/branch-on-zero.ll @@ -129,9 +129,8 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV32-NEXT: lw a3, 0(a1) ; RV32-NEXT: addi a4, a1, 4 ; RV32-NEXT: slli a3, a3, 1 -; RV32-NEXT: addi a1, a0, 4 ; RV32-NEXT: sw a3, 0(a0) -; RV32-NEXT: mv a0, a1 +; RV32-NEXT: addi a0, a0, 4 ; RV32-NEXT: mv a1, a4 ; RV32-NEXT: bne a4, a2, .LBB3_2 ; RV32-NEXT: .LBB3_3: # %while.end @@ -153,9 +152,8 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) { ; RV64-NEXT: lw a3, 0(a1) ; RV64-NEXT: addi a4, a1, 4 ; RV64-NEXT: slli a3, a3, 1 -; RV64-NEXT: addi a1, a0, 4 ; RV64-NEXT: sw a3, 0(a0) -; RV64-NEXT: mv a0, a1 +; RV64-NEXT: addi a0, a0, 4 ; RV64-NEXT: mv a1, a4 ; RV64-NEXT: bne a4, a2, .LBB3_2 ; RV64-NEXT: .LBB3_3: # %while.end diff --git a/llvm/test/CodeGen/RISCV/condops.ll b/llvm/test/CodeGen/RISCV/condops.ll index 622365cf13bce..650b57acafc2b 100644 --- a/llvm/test/CodeGen/RISCV/condops.ll +++ b/llvm/test/CodeGen/RISCV/condops.ll @@ -455,9 +455,8 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-NEXT: sltu a5, a1, a3 ; RV32I-NEXT: and a0, a0, a4 ; RV32I-NEXT: sub a2, a2, a0 -; RV32I-NEXT: sub a2, a2, a5 ; RV32I-NEXT: sub a0, a1, a3 -; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: sub a1, a2, a5 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sub1: @@ -473,9 +472,8 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-NEXT: sltu a5, a1, a3 ; RV32XVENTANACONDOPS-NEXT: vt.maskc a0, a4, a0 ; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a0 -; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a5 ; RV32XVENTANACONDOPS-NEXT: sub a0, a1, a3 -; RV32XVENTANACONDOPS-NEXT: mv a1, a2 +; RV32XVENTANACONDOPS-NEXT: sub a1, a2, a5 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: sub1: @@ -496,9 +494,8 @@ define i64 @sub1(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-NEXT: sltu a5, a1, a3 ; RV32ZICOND-NEXT: czero.eqz a0, a4, a0 ; RV32ZICOND-NEXT: sub a2, a2, a0 -; RV32ZICOND-NEXT: sub a2, a2, a5 ; RV32ZICOND-NEXT: sub a0, a1, a3 -; RV32ZICOND-NEXT: mv a1, a2 +; RV32ZICOND-NEXT: sub a1, a2, a5 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: sub1: @@ -519,9 +516,8 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32I-NEXT: sltu a5, a1, a3 ; RV32I-NEXT: and a0, a0, a4 ; RV32I-NEXT: sub a2, a2, a0 -; RV32I-NEXT: sub a2, a2, a5 ; RV32I-NEXT: sub a0, a1, a3 -; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: sub a1, a2, a5 ; RV32I-NEXT: ret ; ; RV64I-LABEL: sub2: @@ -537,9 +533,8 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32XVENTANACONDOPS-NEXT: sltu a5, a1, a3 ; RV32XVENTANACONDOPS-NEXT: vt.maskcn a0, a4, a0 ; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a0 -; RV32XVENTANACONDOPS-NEXT: sub a2, a2, a5 ; RV32XVENTANACONDOPS-NEXT: sub a0, a1, a3 -; RV32XVENTANACONDOPS-NEXT: mv a1, a2 +; RV32XVENTANACONDOPS-NEXT: sub a1, a2, a5 ; RV32XVENTANACONDOPS-NEXT: ret ; ; RV64XVENTANACONDOPS-LABEL: sub2: @@ -560,9 +555,8 @@ define i64 @sub2(i1 zeroext %rc, i64 %rs1, i64 %rs2) { ; RV32ZICOND-NEXT: sltu a5, a1, a3 ; RV32ZICOND-NEXT: czero.nez a0, a4, a0 ; RV32ZICOND-NEXT: sub a2, a2, a0 -; RV32ZICOND-NEXT: sub a2, a2, a5 ; RV32ZICOND-NEXT: sub a0, a1, a3 -; RV32ZICOND-NEXT: mv a1, a2 +; RV32ZICOND-NEXT: sub a1, a2, a5 ; RV32ZICOND-NEXT: ret ; ; RV64ZICOND-LABEL: sub2: diff --git a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll index e864d8fb0eddd..8f88fbf4c0586 100644 --- 
a/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll +++ b/llvm/test/CodeGen/RISCV/double-fcmp-strict.ll @@ -288,9 +288,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: csrr a4, fflags ; RV32IZFINXZDINX-NEXT: flt.d a6, a2, a0 ; RV32IZFINXZDINX-NEXT: csrw fflags, a4 -; RV32IZFINXZDINX-NEXT: or a4, a6, a5 ; RV32IZFINXZDINX-NEXT: feq.d zero, a2, a0 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: or a0, a6, a5 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_one: @@ -302,9 +301,8 @@ define i32 @fcmp_one(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: csrr a2, fflags ; RV64IZFINXZDINX-NEXT: flt.d a4, a1, a0 ; RV64IZFINXZDINX-NEXT: csrw fflags, a2 -; RV64IZFINXZDINX-NEXT: or a2, a4, a3 ; RV64IZFINXZDINX-NEXT: feq.d zero, a1, a0 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: or a0, a4, a3 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_one: @@ -438,9 +436,8 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: flt.d a6, a2, a0 ; RV32IZFINXZDINX-NEXT: csrw fflags, a4 ; RV32IZFINXZDINX-NEXT: or a4, a6, a5 -; RV32IZFINXZDINX-NEXT: xori a4, a4, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a2, a0 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: xori a0, a4, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_ueq: @@ -453,9 +450,8 @@ define i32 @fcmp_ueq(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: flt.d a4, a1, a0 ; RV64IZFINXZDINX-NEXT: csrw fflags, a2 ; RV64IZFINXZDINX-NEXT: or a3, a4, a3 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a1, a0 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ueq: @@ -531,9 +527,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: csrr a4, fflags ; RV32IZFINXZDINX-NEXT: fle.d a5, a0, a2 ; RV32IZFINXZDINX-NEXT: csrw fflags, a4 -; RV32IZFINXZDINX-NEXT: xori a4, a5, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a0, a2 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: xori a0, a5, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_ugt: @@ -541,9 +536,8 @@ define i32 @fcmp_ugt(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: csrr a2, fflags ; RV64IZFINXZDINX-NEXT: fle.d a3, a0, a1 ; RV64IZFINXZDINX-NEXT: csrw fflags, a2 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a0, a1 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ugt: @@ -585,9 +579,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: csrr a4, fflags ; RV32IZFINXZDINX-NEXT: flt.d a5, a0, a2 ; RV32IZFINXZDINX-NEXT: csrw fflags, a4 -; RV32IZFINXZDINX-NEXT: xori a4, a5, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a0, a2 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: xori a0, a5, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_uge: @@ -595,9 +588,8 @@ define i32 @fcmp_uge(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: csrr a2, fflags ; RV64IZFINXZDINX-NEXT: flt.d a3, a0, a1 ; RV64IZFINXZDINX-NEXT: csrw fflags, a2 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a0, a1 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_uge: @@ -641,9 +633,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp { ; 
RV32IZFINXZDINX-NEXT: csrr a4, fflags ; RV32IZFINXZDINX-NEXT: fle.d a5, a2, a0 ; RV32IZFINXZDINX-NEXT: csrw fflags, a4 -; RV32IZFINXZDINX-NEXT: xori a4, a5, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a2, a0 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: xori a0, a5, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_ult: @@ -651,9 +642,8 @@ define i32 @fcmp_ult(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: csrr a2, fflags ; RV64IZFINXZDINX-NEXT: fle.d a3, a1, a0 ; RV64IZFINXZDINX-NEXT: csrw fflags, a2 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a1, a0 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ult: @@ -695,9 +685,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp { ; RV32IZFINXZDINX-NEXT: csrr a4, fflags ; RV32IZFINXZDINX-NEXT: flt.d a5, a2, a0 ; RV32IZFINXZDINX-NEXT: csrw fflags, a4 -; RV32IZFINXZDINX-NEXT: xori a4, a5, 1 ; RV32IZFINXZDINX-NEXT: feq.d zero, a2, a0 -; RV32IZFINXZDINX-NEXT: mv a0, a4 +; RV32IZFINXZDINX-NEXT: xori a0, a5, 1 ; RV32IZFINXZDINX-NEXT: ret ; ; RV64IZFINXZDINX-LABEL: fcmp_ule: @@ -705,9 +694,8 @@ define i32 @fcmp_ule(double %a, double %b) nounwind strictfp { ; RV64IZFINXZDINX-NEXT: csrr a2, fflags ; RV64IZFINXZDINX-NEXT: flt.d a3, a1, a0 ; RV64IZFINXZDINX-NEXT: csrw fflags, a2 -; RV64IZFINXZDINX-NEXT: xori a2, a3, 1 ; RV64IZFINXZDINX-NEXT: feq.d zero, a1, a0 -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: xori a0, a3, 1 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ule: diff --git a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll index dae9f3e089cf4..6c75edb479252 100644 --- a/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll +++ b/llvm/test/CodeGen/RISCV/float-fcmp-strict.ll @@ -247,9 +247,8 @@ define i32 @fcmp_one(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: csrr a2, fflags ; CHECKIZFINX-NEXT: flt.s a4, a1, a0 ; CHECKIZFINX-NEXT: csrw fflags, a2 -; CHECKIZFINX-NEXT: or a2, a4, a3 ; CHECKIZFINX-NEXT: feq.s zero, a1, a0 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: or a0, a4, a3 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_one: @@ -368,9 +367,8 @@ define i32 @fcmp_ueq(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: flt.s a4, a1, a0 ; CHECKIZFINX-NEXT: csrw fflags, a2 ; CHECKIZFINX-NEXT: or a3, a4, a3 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a1, a0 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ueq: @@ -438,9 +436,8 @@ define i32 @fcmp_ugt(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: csrr a2, fflags ; CHECKIZFINX-NEXT: fle.s a3, a0, a1 ; CHECKIZFINX-NEXT: csrw fflags, a2 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a0, a1 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ugt: @@ -482,9 +479,8 @@ define i32 @fcmp_uge(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: csrr a2, fflags ; CHECKIZFINX-NEXT: flt.s a3, a0, a1 ; CHECKIZFINX-NEXT: csrw fflags, a2 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a0, a1 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_uge: @@ -528,9 +524,8 @@ define i32 @fcmp_ult(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: csrr a2, fflags ; CHECKIZFINX-NEXT: fle.s a3, a1, a0 ; CHECKIZFINX-NEXT: 
csrw fflags, a2 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a1, a0 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ult: @@ -572,9 +567,8 @@ define i32 @fcmp_ule(float %a, float %b) nounwind strictfp { ; CHECKIZFINX-NEXT: csrr a2, fflags ; CHECKIZFINX-NEXT: flt.s a3, a1, a0 ; CHECKIZFINX-NEXT: csrw fflags, a2 -; CHECKIZFINX-NEXT: xori a2, a3, 1 ; CHECKIZFINX-NEXT: feq.s zero, a1, a0 -; CHECKIZFINX-NEXT: mv a0, a2 +; CHECKIZFINX-NEXT: xori a0, a3, 1 ; CHECKIZFINX-NEXT: ret ; ; RV32I-LABEL: fcmp_ule: diff --git a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll index d96c39c504e1f..f4f1279a8e18e 100644 --- a/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll +++ b/llvm/test/CodeGen/RISCV/half-fcmp-strict.ll @@ -235,9 +235,8 @@ define i32 @fcmp_one(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: csrr a2, fflags ; CHECKIZHINX-NEXT: flt.h a4, a1, a0 ; CHECKIZHINX-NEXT: csrw fflags, a2 -; CHECKIZHINX-NEXT: or a2, a4, a3 ; CHECKIZHINX-NEXT: feq.h zero, a1, a0 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: or a0, a4, a3 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_one: @@ -334,9 +333,8 @@ define i32 @fcmp_ueq(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: flt.h a4, a1, a0 ; CHECKIZHINX-NEXT: csrw fflags, a2 ; CHECKIZHINX-NEXT: or a3, a4, a3 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a1, a0 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_ueq: @@ -388,9 +386,8 @@ define i32 @fcmp_ugt(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: csrr a2, fflags ; CHECKIZHINX-NEXT: fle.h a3, a0, a1 ; CHECKIZHINX-NEXT: csrw fflags, a2 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a0, a1 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_ugt: @@ -432,9 +429,8 @@ define i32 @fcmp_uge(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: csrr a2, fflags ; CHECKIZHINX-NEXT: flt.h a3, a0, a1 ; CHECKIZHINX-NEXT: csrw fflags, a2 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a0, a1 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_uge: @@ -476,9 +472,8 @@ define i32 @fcmp_ult(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: csrr a2, fflags ; CHECKIZHINX-NEXT: fle.h a3, a1, a0 ; CHECKIZHINX-NEXT: csrw fflags, a2 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a1, a0 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_ult: @@ -520,9 +515,8 @@ define i32 @fcmp_ule(half %a, half %b) nounwind strictfp { ; CHECKIZHINX-NEXT: csrr a2, fflags ; CHECKIZHINX-NEXT: flt.h a3, a1, a0 ; CHECKIZHINX-NEXT: csrw fflags, a2 -; CHECKIZHINX-NEXT: xori a2, a3, 1 ; CHECKIZHINX-NEXT: feq.h zero, a1, a0 -; CHECKIZHINX-NEXT: mv a0, a2 +; CHECKIZHINX-NEXT: xori a0, a3, 1 ; CHECKIZHINX-NEXT: ret ; ; CHECKIZFHMIN-LABEL: fcmp_ule: diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll index 30f9dd1e51658..23c885a1d2cb6 100644 --- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll +++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll @@ -641,8 +641,8 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; RV32IZFINXZDINX-NEXT: mv s0, a4 ; RV32IZFINXZDINX-NEXT: mv s1, a3 ; 
-; RV32IZFINXZDINX-NEXT: mv a2, a1
 ; RV32IZFINXZDINX-NEXT: mv s3, a0
+; RV32IZFINXZDINX-NEXT: mv a2, a1
 ; RV32IZFINXZDINX-NEXT: addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT: mv a0, a2
 ; RV32IZFINXZDINX-NEXT: call frexpf
@@ -691,8 +691,8 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64IZFINXZDINX-NEXT: mv s0, a4
 ; RV64IZFINXZDINX-NEXT: mv s1, a3
 ; RV64IZFINXZDINX-NEXT: mv s2, a2
-; RV64IZFINXZDINX-NEXT: mv a2, a1
 ; RV64IZFINXZDINX-NEXT: mv s3, a0
+; RV64IZFINXZDINX-NEXT: mv a2, a1
 ; RV64IZFINXZDINX-NEXT: mv a1, sp
 ; RV64IZFINXZDINX-NEXT: mv a0, a2
 ; RV64IZFINXZDINX-NEXT: call frexpf
@@ -741,10 +741,9 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32I-NEXT: lw s0, 12(a1)
 ; RV32I-NEXT: lw s1, 8(a1)
 ; RV32I-NEXT: lw s2, 4(a1)
-; RV32I-NEXT: lw a2, 0(a1)
 ; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: lw a0, 0(a1)
 ; RV32I-NEXT: addi a1, sp, 8
-; RV32I-NEXT: mv a0, a2
 ; RV32I-NEXT: call frexpf
 ; RV32I-NEXT: mv s4, a0
 ; RV32I-NEXT: addi a1, sp, 12
@@ -791,10 +790,9 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64I-NEXT: lw s0, 24(a1)
 ; RV64I-NEXT: lw s1, 16(a1)
 ; RV64I-NEXT: lw s2, 8(a1)
-; RV64I-NEXT: lw a2, 0(a1)
 ; RV64I-NEXT: mv s3, a0
+; RV64I-NEXT: lw a0, 0(a1)
 ; RV64I-NEXT: mv a1, sp
-; RV64I-NEXT: mv a0, a2
 ; RV64I-NEXT: call frexpf
 ; RV64I-NEXT: mv s4, a0
 ; RV64I-NEXT: addi a1, sp, 4
@@ -925,8 +923,8 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV32IZFINXZDINX-NEXT: mv s0, a4
 ; RV32IZFINXZDINX-NEXT: mv s1, a3
 ; RV32IZFINXZDINX-NEXT: mv s2, a2
-; RV32IZFINXZDINX-NEXT: mv a2, a1
 ; RV32IZFINXZDINX-NEXT: mv s3, a0
+; RV32IZFINXZDINX-NEXT: mv a2, a1
 ; RV32IZFINXZDINX-NEXT: addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT: mv a0, a2
 ; RV32IZFINXZDINX-NEXT: call frexpf
@@ -967,8 +965,8 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV64IZFINXZDINX-NEXT: mv s0, a4
 ; RV64IZFINXZDINX-NEXT: mv s1, a3
 ; RV64IZFINXZDINX-NEXT: mv s2, a2
-; RV64IZFINXZDINX-NEXT: mv a2, a1
 ; RV64IZFINXZDINX-NEXT: mv s3, a0
+; RV64IZFINXZDINX-NEXT: mv a2, a1
 ; RV64IZFINXZDINX-NEXT: mv a1, sp
 ; RV64IZFINXZDINX-NEXT: mv a0, a2
 ; RV64IZFINXZDINX-NEXT: call frexpf
@@ -1009,10 +1007,9 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV32I-NEXT: lw s0, 12(a1)
 ; RV32I-NEXT: lw s1, 8(a1)
 ; RV32I-NEXT: lw s2, 4(a1)
-; RV32I-NEXT: lw a2, 0(a1)
 ; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: lw a0, 0(a1)
 ; RV32I-NEXT: addi a1, sp, 8
-; RV32I-NEXT: mv a0, a2
 ; RV32I-NEXT: call frexpf
 ; RV32I-NEXT: mv s4, a0
 ; RV32I-NEXT: addi a1, sp, 12
@@ -1051,10 +1048,9 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV64I-NEXT: lw s0, 24(a1)
 ; RV64I-NEXT: lw s1, 16(a1)
 ; RV64I-NEXT: lw s2, 8(a1)
-; RV64I-NEXT: lw a2, 0(a1)
 ; RV64I-NEXT: mv s3, a0
+; RV64I-NEXT: lw a0, 0(a1)
 ; RV64I-NEXT: mv a1, sp
-; RV64I-NEXT: mv a0, a2
 ; RV64I-NEXT: call frexpf
 ; RV64I-NEXT: mv s4, a0
 ; RV64I-NEXT: addi a1, sp, 4
@@ -1175,8 +1171,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32IZFINXZDINX-NEXT: mv s0, a4
 ; RV32IZFINXZDINX-NEXT: mv s1, a3
 ; RV32IZFINXZDINX-NEXT: mv s2, a2
-; RV32IZFINXZDINX-NEXT: mv a2, a1
 ; RV32IZFINXZDINX-NEXT: mv s3, a0
+; RV32IZFINXZDINX-NEXT: mv a2, a1
 ; RV32IZFINXZDINX-NEXT: addi a1, sp, 12
 ; RV32IZFINXZDINX-NEXT: mv a0, a2
 ; RV32IZFINXZDINX-NEXT: call frexpf
@@ -1216,8 +1212,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV64IZFINXZDINX-NEXT: mv s0, a4
 ; RV64IZFINXZDINX-NEXT: mv s1, a3
 ; RV64IZFINXZDINX-NEXT: mv s2, a2
-; RV64IZFINXZDINX-NEXT: mv a2, a1
 ; RV64IZFINXZDINX-NEXT: mv s3, a0
+; RV64IZFINXZDINX-NEXT: mv a2, a1
 ; RV64IZFINXZDINX-NEXT: addi a1, sp, 8
 ; RV64IZFINXZDINX-NEXT: mv a0, a2
 ; RV64IZFINXZDINX-NEXT: call frexpf
@@ -1257,10 +1253,9 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32I-NEXT: lw s0, 12(a1)
 ; RV32I-NEXT: lw s1, 8(a1)
 ; RV32I-NEXT: lw s2, 4(a1)
-; RV32I-NEXT: lw a2, 0(a1)
 ; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: lw a0, 0(a1)
 ; RV32I-NEXT: addi a1, sp, 12
-; RV32I-NEXT: mv a0, a2
 ; RV32I-NEXT: call frexpf
 ; RV32I-NEXT: addi a1, sp, 16
 ; RV32I-NEXT: mv a0, s2
@@ -1298,10 +1293,9 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV64I-NEXT: lw s0, 24(a1)
 ; RV64I-NEXT: lw s1, 16(a1)
 ; RV64I-NEXT: lw s2, 8(a1)
-; RV64I-NEXT: lw a2, 0(a1)
 ; RV64I-NEXT: mv s3, a0
+; RV64I-NEXT: lw a0, 0(a1)
 ; RV64I-NEXT: addi a1, sp, 8
-; RV64I-NEXT: mv a0, a2
 ; RV64I-NEXT: call frexpf
 ; RV64I-NEXT: addi a1, sp, 12
 ; RV64I-NEXT: mv a0, s2
diff --git a/llvm/test/CodeGen/RISCV/machine-cp.mir b/llvm/test/CodeGen/RISCV/machine-cp.mir
index f3674f89cd918..ce3d71f55e065 100644
--- a/llvm/test/CodeGen/RISCV/machine-cp.mir
+++ b/llvm/test/CodeGen/RISCV/machine-cp.mir
@@ -19,14 +19,15 @@ body: |
 ; RV32: liveins: $v28_v29_v30, $v8_v9, $v1
 ; RV32-NEXT: {{ $}}
 ; RV32-NEXT: renamable $v4_v5_v6_v7_v8_v9_v10_v11 = COPY killed renamable $v0_v1_v2_v3_v4_v5_v6_v7
- ; RV32-NEXT: renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
- ; RV32-NEXT: PseudoRET implicit $v28
+ ; RV32-NEXT: renamable $v28 = COPY killed renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
+ ; RV32-NEXT: PseudoRET implicit killed $v28
+ ;
 ; RV64-LABEL: name: foo
 ; RV64: liveins: $v28_v29_v30, $v8_v9, $v1
 ; RV64-NEXT: {{ $}}
 ; RV64-NEXT: renamable $v4_v5_v6_v7_v8_v9_v10_v11 = COPY killed renamable $v0_v1_v2_v3_v4_v5_v6_v7
- ; RV64-NEXT: renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
- ; RV64-NEXT: PseudoRET implicit $v28
+ ; RV64-NEXT: renamable $v28 = COPY killed renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
+ ; RV64-NEXT: PseudoRET implicit killed $v28
 renamable $v8 = COPY renamable $v1, implicit killed $v8_v9, implicit-def $v8_v9
 renamable $v4_v5_v6_v7_v8_v9_v10_v11 = COPY killed renamable $v0_v1_v2_v3_v4_v5_v6_v7
 renamable $v28 = COPY renamable $v8, implicit killed $v28_v29_v30, implicit-def $v28_v29_v30
diff --git a/llvm/test/CodeGen/RISCV/neg-abs.ll b/llvm/test/CodeGen/RISCV/neg-abs.ll
index 6f301882b452c..48f725dba5dcc 100644
--- a/llvm/test/CodeGen/RISCV/neg-abs.ll
+++ b/llvm/test/CodeGen/RISCV/neg-abs.ll
@@ -211,10 +211,9 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
 ; RV32I-NEXT: sw a0, 0(a2)
 ; RV32I-NEXT: snez a3, a0
 ; RV32I-NEXT: neg a4, a1
-; RV32I-NEXT: sub a3, a4, a3
-; RV32I-NEXT: neg a0, a0
 ; RV32I-NEXT: sw a1, 4(a2)
-; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: sub a1, a4, a3
+; RV32I-NEXT: neg a0, a0
 ; RV32I-NEXT: ret
 ;
 ; RV32ZBB-LABEL: neg_abs64_multiuse:
@@ -229,10 +228,9 @@ define i64 @neg_abs64_multiuse(i64 %x, ptr %y) {
 ; RV32ZBB-NEXT: sw a0, 0(a2)
 ; RV32ZBB-NEXT: snez a3, a0
 ; RV32ZBB-NEXT: neg a4, a1
-; RV32ZBB-NEXT: sub a3, a4, a3
-; RV32ZBB-NEXT: neg a0, a0
 ; RV32ZBB-NEXT: sw a1, 4(a2)
-; RV32ZBB-NEXT: mv a1, a3
+; RV32ZBB-NEXT: sub a1, a4, a3
+; RV32ZBB-NEXT: neg a0, a0
 ; RV32ZBB-NEXT: ret
 ;
 ; RV64I-LABEL: neg_abs64_multiuse:
diff --git a/llvm/test/CodeGen/RISCV/nontemporal.ll b/llvm/test/CodeGen/RISCV/nontemporal.ll
index 4c5c36fc72d14..4d2aa667a402f 100644
--- a/llvm/test/CodeGen/RISCV/nontemporal.ll
+++ b/llvm/test/CodeGen/RISCV/nontemporal.ll
@@ -16,10 +16,9 @@ define i64 @test_nontemporal_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_load_i64:
 ; CHECK-RV32: # %bb.0:
 ; CHECK-RV32-NEXT: ntl.all
-; CHECK-RV32-NEXT: lw a2, 0(a0)
-; CHECK-RV32-NEXT: ntl.all
 ; CHECK-RV32-NEXT: lw a1, 4(a0)
-; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a0, 0(a0)
 ; CHECK-RV32-NEXT: ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_load_i64:
@@ -31,10 +30,9 @@ define i64 @test_nontemporal_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_load_i64:
 ; CHECK-RV32C: # %bb.0:
 ; CHECK-RV32C-NEXT: c.ntl.all
-; CHECK-RV32C-NEXT: lw a2, 0(a0)
-; CHECK-RV32C-NEXT: c.ntl.all
 ; CHECK-RV32C-NEXT: lw a1, 4(a0)
-; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
 ; CHECK-RV32C-NEXT: ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_load_i64:
@@ -46,10 +44,9 @@ define i64 @test_nontemporal_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_load_i64:
 ; CHECK-RV32V: # %bb.0:
 ; CHECK-RV32V-NEXT: ntl.all
-; CHECK-RV32V-NEXT: lw a2, 0(a0)
-; CHECK-RV32V-NEXT: ntl.all
 ; CHECK-RV32V-NEXT: lw a1, 4(a0)
-; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
 ; CHECK-RV32V-NEXT: ret
 %1 = load i64, ptr %p, !nontemporal !0
@@ -540,10 +537,9 @@ define <2 x i64> @test_nontemporal_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_load_v2i64:
 ; CHECK-RV64: # %bb.0:
 ; CHECK-RV64-NEXT: ntl.all
-; CHECK-RV64-NEXT: ld a2, 0(a0)
-; CHECK-RV64-NEXT: ntl.all
 ; CHECK-RV64-NEXT: ld a1, 8(a0)
-; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a0, 0(a0)
 ; CHECK-RV64-NEXT: ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_load_v2i64:
@@ -565,10 +561,9 @@ define <2 x i64> @test_nontemporal_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_load_v2i64:
 ; CHECK-RV64C: # %bb.0:
 ; CHECK-RV64C-NEXT: c.ntl.all
-; CHECK-RV64C-NEXT: ld a2, 0(a0)
-; CHECK-RV64C-NEXT: c.ntl.all
 ; CHECK-RV64C-NEXT: ld a1, 8(a0)
-; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
 ; CHECK-RV64C-NEXT: ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_load_v2i64:
@@ -1448,10 +1443,9 @@ define i64 @test_nontemporal_P1_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_P1_load_i64:
 ; CHECK-RV32: # %bb.0:
 ; CHECK-RV32-NEXT: ntl.p1
-; CHECK-RV32-NEXT: lw a2, 0(a0)
-; CHECK-RV32-NEXT: ntl.p1
 ; CHECK-RV32-NEXT: lw a1, 4(a0)
-; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ntl.p1
+; CHECK-RV32-NEXT: lw a0, 0(a0)
 ; CHECK-RV32-NEXT: ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_P1_load_i64:
@@ -1463,10 +1457,9 @@ define i64 @test_nontemporal_P1_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_P1_load_i64:
 ; CHECK-RV32C: # %bb.0:
 ; CHECK-RV32C-NEXT: c.ntl.p1
-; CHECK-RV32C-NEXT: lw a2, 0(a0)
-; CHECK-RV32C-NEXT: c.ntl.p1
 ; CHECK-RV32C-NEXT: lw a1, 4(a0)
-; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: c.ntl.p1
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
 ; CHECK-RV32C-NEXT: ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_P1_load_i64:
@@ -1478,10 +1471,9 @@ define i64 @test_nontemporal_P1_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_P1_load_i64:
 ; CHECK-RV32V: # %bb.0:
 ; CHECK-RV32V-NEXT: ntl.p1
-; CHECK-RV32V-NEXT: lw a2, 0(a0)
-; CHECK-RV32V-NEXT: ntl.p1
 ; CHECK-RV32V-NEXT: lw a1, 4(a0)
-; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ntl.p1
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
 ; CHECK-RV32V-NEXT: ret
 %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !1
 ret i64 %1
@@ -1962,10 +1954,9 @@ define <2 x i64> @test_nontemporal_P1_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_P1_load_v2i64:
 ; CHECK-RV64: # %bb.0:
 ; CHECK-RV64-NEXT: ntl.p1
-; CHECK-RV64-NEXT: ld a2, 0(a0)
-; CHECK-RV64-NEXT: ntl.p1
 ; CHECK-RV64-NEXT: ld a1, 8(a0)
-; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ntl.p1
+; CHECK-RV64-NEXT: ld a0, 0(a0)
 ; CHECK-RV64-NEXT: ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_P1_load_v2i64:
@@ -1987,10 +1978,9 @@ define <2 x i64> @test_nontemporal_P1_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_P1_load_v2i64:
 ; CHECK-RV64C: # %bb.0:
 ; CHECK-RV64C-NEXT: c.ntl.p1
-; CHECK-RV64C-NEXT: ld a2, 0(a0)
-; CHECK-RV64C-NEXT: c.ntl.p1
 ; CHECK-RV64C-NEXT: ld a1, 8(a0)
-; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: c.ntl.p1
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
 ; CHECK-RV64C-NEXT: ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_P1_load_v2i64:
@@ -2862,10 +2852,9 @@ define i64 @test_nontemporal_PALL_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_PALL_load_i64:
 ; CHECK-RV32: # %bb.0:
 ; CHECK-RV32-NEXT: ntl.pall
-; CHECK-RV32-NEXT: lw a2, 0(a0)
-; CHECK-RV32-NEXT: ntl.pall
 ; CHECK-RV32-NEXT: lw a1, 4(a0)
-; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ntl.pall
+; CHECK-RV32-NEXT: lw a0, 0(a0)
 ; CHECK-RV32-NEXT: ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_i64:
@@ -2877,10 +2866,9 @@ define i64 @test_nontemporal_PALL_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_i64:
 ; CHECK-RV32C: # %bb.0:
 ; CHECK-RV32C-NEXT: c.ntl.pall
-; CHECK-RV32C-NEXT: lw a2, 0(a0)
-; CHECK-RV32C-NEXT: c.ntl.pall
 ; CHECK-RV32C-NEXT: lw a1, 4(a0)
-; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: c.ntl.pall
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
 ; CHECK-RV32C-NEXT: ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_PALL_load_i64:
@@ -2892,10 +2880,9 @@ define i64 @test_nontemporal_PALL_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_PALL_load_i64:
 ; CHECK-RV32V: # %bb.0:
 ; CHECK-RV32V-NEXT: ntl.pall
-; CHECK-RV32V-NEXT: lw a2, 0(a0)
-; CHECK-RV32V-NEXT: ntl.pall
 ; CHECK-RV32V-NEXT: lw a1, 4(a0)
-; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ntl.pall
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
 ; CHECK-RV32V-NEXT: ret
 %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !2
 ret i64 %1
@@ -3376,10 +3363,9 @@ define <2 x i64> @test_nontemporal_PALL_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_PALL_load_v2i64:
 ; CHECK-RV64: # %bb.0:
 ; CHECK-RV64-NEXT: ntl.pall
-; CHECK-RV64-NEXT: ld a2, 0(a0)
-; CHECK-RV64-NEXT: ntl.pall
 ; CHECK-RV64-NEXT: ld a1, 8(a0)
-; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ntl.pall
+; CHECK-RV64-NEXT: ld a0, 0(a0)
 ; CHECK-RV64-NEXT: ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_PALL_load_v2i64:
@@ -3401,10 +3387,9 @@ define <2 x i64> @test_nontemporal_PALL_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_PALL_load_v2i64:
 ; CHECK-RV64C: # %bb.0:
 ; CHECK-RV64C-NEXT: c.ntl.pall
-; CHECK-RV64C-NEXT: ld a2, 0(a0)
-; CHECK-RV64C-NEXT: c.ntl.pall
 ; CHECK-RV64C-NEXT: ld a1, 8(a0)
-; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: c.ntl.pall
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
 ; CHECK-RV64C-NEXT: ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_PALL_load_v2i64:
@@ -4276,10 +4261,9 @@ define i64 @test_nontemporal_S1_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_S1_load_i64:
 ; CHECK-RV32: # %bb.0:
 ; CHECK-RV32-NEXT: ntl.s1
-; CHECK-RV32-NEXT: lw a2, 0(a0)
-; CHECK-RV32-NEXT: ntl.s1
 ; CHECK-RV32-NEXT: lw a1, 4(a0)
-; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ntl.s1
+; CHECK-RV32-NEXT: lw a0, 0(a0)
 ; CHECK-RV32-NEXT: ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_S1_load_i64:
@@ -4291,10 +4275,9 @@ define i64 @test_nontemporal_S1_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_S1_load_i64:
 ; CHECK-RV32C: # %bb.0:
 ; CHECK-RV32C-NEXT: c.ntl.s1
-; CHECK-RV32C-NEXT: lw a2, 0(a0)
-; CHECK-RV32C-NEXT: c.ntl.s1
 ; CHECK-RV32C-NEXT: lw a1, 4(a0)
-; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: c.ntl.s1
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
 ; CHECK-RV32C-NEXT: ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_S1_load_i64:
@@ -4306,10 +4289,9 @@ define i64 @test_nontemporal_S1_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_S1_load_i64:
 ; CHECK-RV32V: # %bb.0:
 ; CHECK-RV32V-NEXT: ntl.s1
-; CHECK-RV32V-NEXT: lw a2, 0(a0)
-; CHECK-RV32V-NEXT: ntl.s1
 ; CHECK-RV32V-NEXT: lw a1, 4(a0)
-; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ntl.s1
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
 ; CHECK-RV32V-NEXT: ret
 %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !3
 ret i64 %1
@@ -4790,10 +4772,9 @@ define <2 x i64> @test_nontemporal_S1_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_S1_load_v2i64:
 ; CHECK-RV64: # %bb.0:
 ; CHECK-RV64-NEXT: ntl.s1
-; CHECK-RV64-NEXT: ld a2, 0(a0)
-; CHECK-RV64-NEXT: ntl.s1
 ; CHECK-RV64-NEXT: ld a1, 8(a0)
-; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ntl.s1
+; CHECK-RV64-NEXT: ld a0, 0(a0)
 ; CHECK-RV64-NEXT: ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_S1_load_v2i64:
@@ -4815,10 +4796,9 @@ define <2 x i64> @test_nontemporal_S1_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_S1_load_v2i64:
 ; CHECK-RV64C: # %bb.0:
 ; CHECK-RV64C-NEXT: c.ntl.s1
-; CHECK-RV64C-NEXT: ld a2, 0(a0)
-; CHECK-RV64C-NEXT: c.ntl.s1
 ; CHECK-RV64C-NEXT: ld a1, 8(a0)
-; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: c.ntl.s1
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
 ; CHECK-RV64C-NEXT: ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_S1_load_v2i64:
@@ -5690,10 +5670,9 @@ define i64 @test_nontemporal_ALL_load_i64(ptr %p) {
 ; CHECK-RV32-LABEL: test_nontemporal_ALL_load_i64:
 ; CHECK-RV32: # %bb.0:
 ; CHECK-RV32-NEXT: ntl.all
-; CHECK-RV32-NEXT: lw a2, 0(a0)
-; CHECK-RV32-NEXT: ntl.all
 ; CHECK-RV32-NEXT: lw a1, 4(a0)
-; CHECK-RV32-NEXT: mv a0, a2
+; CHECK-RV32-NEXT: ntl.all
+; CHECK-RV32-NEXT: lw a0, 0(a0)
 ; CHECK-RV32-NEXT: ret
 ;
 ; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_i64:
@@ -5705,10 +5684,9 @@ define i64 @test_nontemporal_ALL_load_i64(ptr %p) {
 ; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_i64:
 ; CHECK-RV32C: # %bb.0:
 ; CHECK-RV32C-NEXT: c.ntl.all
-; CHECK-RV32C-NEXT: lw a2, 0(a0)
-; CHECK-RV32C-NEXT: c.ntl.all
 ; CHECK-RV32C-NEXT: lw a1, 4(a0)
-; CHECK-RV32C-NEXT: mv a0, a2
+; CHECK-RV32C-NEXT: c.ntl.all
+; CHECK-RV32C-NEXT: lw a0, 0(a0)
 ; CHECK-RV32C-NEXT: ret
 ;
 ; CHECK-RV64V-LABEL: test_nontemporal_ALL_load_i64:
@@ -5720,10 +5698,9 @@ define i64 @test_nontemporal_ALL_load_i64(ptr %p) {
 ; CHECK-RV32V-LABEL: test_nontemporal_ALL_load_i64:
 ; CHECK-RV32V: # %bb.0:
 ; CHECK-RV32V-NEXT: ntl.all
-; CHECK-RV32V-NEXT: lw a2, 0(a0)
-; CHECK-RV32V-NEXT: ntl.all
 ; CHECK-RV32V-NEXT: lw a1, 4(a0)
-; CHECK-RV32V-NEXT: mv a0, a2
+; CHECK-RV32V-NEXT: ntl.all
+; CHECK-RV32V-NEXT: lw a0, 0(a0)
 ; CHECK-RV32V-NEXT: ret
 %1 = load i64, ptr %p, !nontemporal !0, !riscv-nontemporal-domain !4
 ret i64 %1
@@ -6204,10 +6181,9 @@ define <2 x i64> @test_nontemporal_ALL_load_v2i64(ptr %p) {
 ; CHECK-RV64-LABEL: test_nontemporal_ALL_load_v2i64:
 ; CHECK-RV64: # %bb.0:
 ; CHECK-RV64-NEXT: ntl.all
-; CHECK-RV64-NEXT: ld a2, 0(a0)
-; CHECK-RV64-NEXT: ntl.all
 ; CHECK-RV64-NEXT: ld a1, 8(a0)
-; CHECK-RV64-NEXT: mv a0, a2
+; CHECK-RV64-NEXT: ntl.all
+; CHECK-RV64-NEXT: ld a0, 0(a0)
 ; CHECK-RV64-NEXT: ret
 ;
 ; CHECK-RV32-LABEL: test_nontemporal_ALL_load_v2i64:
@@ -6229,10 +6205,9 @@ define <2 x i64> @test_nontemporal_ALL_load_v2i64(ptr %p) {
 ; CHECK-RV64C-LABEL: test_nontemporal_ALL_load_v2i64:
 ; CHECK-RV64C: # %bb.0:
 ; CHECK-RV64C-NEXT: c.ntl.all
-; CHECK-RV64C-NEXT: ld a2, 0(a0)
-; CHECK-RV64C-NEXT: c.ntl.all
 ; CHECK-RV64C-NEXT: ld a1, 8(a0)
-; CHECK-RV64C-NEXT: mv a0, a2
+; CHECK-RV64C-NEXT: c.ntl.all
+; CHECK-RV64C-NEXT: ld a0, 0(a0)
 ; CHECK-RV64C-NEXT: ret
 ;
 ; CHECK-RV32C-LABEL: test_nontemporal_ALL_load_v2i64:
diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
index 4bb65f376218f..67143336de477 100644
--- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
+++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll
@@ -686,10 +686,10 @@ define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) {
 ; RV32-LABEL: uaddo_i64_decrement_alt:
 ; RV32: # %bb.0:
 ; RV32-NEXT: or a3, a0, a1
-; RV32-NEXT: snez a3, a3
 ; RV32-NEXT: seqz a4, a0
 ; RV32-NEXT: sub a1, a1, a4
 ; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: snez a3, a3
 ; RV32-NEXT: sw a0, 0(a2)
 ; RV32-NEXT: sw a1, 4(a2)
 ; RV32-NEXT: mv a0, a3
@@ -714,10 +714,10 @@ define i1 @uaddo_i64_decrement_alt_dom(i64 %x, ptr %p) {
 ; RV32-LABEL: uaddo_i64_decrement_alt_dom:
 ; RV32: # %bb.0:
 ; RV32-NEXT: or a3, a0, a1
-; RV32-NEXT: snez a3, a3
 ; RV32-NEXT: seqz a4, a0
 ; RV32-NEXT: sub a1, a1, a4
 ; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: snez a3, a3
 ; RV32-NEXT: sw a0, 0(a2)
 ; RV32-NEXT: sw a1, 4(a2)
 ; RV32-NEXT: mv a0, a3
@@ -830,10 +830,9 @@ define i1 @usubo_ugt_i32(i32 %x, i32 %y, ptr %p) {
 ; RV64: # %bb.0:
 ; RV64-NEXT: sext.w a3, a1
 ; RV64-NEXT: sext.w a4, a0
-; RV64-NEXT: sltu a3, a4, a3
 ; RV64-NEXT: subw a0, a0, a1
 ; RV64-NEXT: sw a0, 0(a2)
-; RV64-NEXT: mv a0, a3
+; RV64-NEXT: sltu a0, a4, a3
 ; RV64-NEXT: ret
 %ov = icmp ugt i32 %y, %x
 %s = sub i32 %x, %y
@@ -929,19 +928,17 @@ define i1 @usubo_ugt_constant_op1_i8(i8 %x, ptr %p) {
 ; RV32-LABEL: usubo_ugt_constant_op1_i8:
 ; RV32: # %bb.0:
 ; RV32-NEXT: andi a2, a0, 255
-; RV32-NEXT: sltiu a2, a2, 45
 ; RV32-NEXT: addi a0, a0, -45
 ; RV32-NEXT: sb a0, 0(a1)
-; RV32-NEXT: mv a0, a2
+; RV32-NEXT: sltiu a0, a2, 45
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: usubo_ugt_constant_op1_i8:
 ; RV64: # %bb.0:
 ; RV64-NEXT: andi a2, a0, 255
-; RV64-NEXT: sltiu a2, a2, 45
 ; RV64-NEXT: addi a0, a0, -45
 ; RV64-NEXT: sb a0, 0(a1)
-; RV64-NEXT: mv a0, a2
+; RV64-NEXT: sltiu a0, a2, 45
 ; RV64-NEXT: ret
 %ov = icmp ugt i8 45, %x
 %s = add i8 %x, -45
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
index 4e958f5699adb..5fe5a0eda46b8 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll
@@ -153,10 +153,10 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind {
 ; CHECK-NEXT: srli a1, a0, 1
 ; CHECK-NEXT: not a6, a2
 ; CHECK-NEXT: srl a3, a1, a6
-; CHECK-NEXT: or a3, a5, a3
 ; CHECK-NEXT: sll a0, a0, a2
 ; CHECK-NEXT: srli a4, a4, 1
 ; CHECK-NEXT: srl a1, a4, a6
+; CHECK-NEXT: or a3, a5, a3
 ; CHECK-NEXT: or a1, a0, a1
 ; CHECK-NEXT: mv a0, a3
 ; CHECK-NEXT: ret
@@ -252,11 +252,10 @@ define i64 @rori_i64(i64 %a) nounwind {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: srli a2, a0, 1
 ; CHECK-NEXT: slli a3, a1, 31
-; CHECK-NEXT: or a2, a3, a2
 ; CHECK-NEXT: srli a1, a1, 1
 ; CHECK-NEXT: slli a0, a0, 31
 ; CHECK-NEXT: or a1, a0, a1
-; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: or a0, a3, a2
 ; CHECK-NEXT: ret
 %1 = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 63)
 ret i64 %1
@@ -267,11 +266,10 @@ define i64 @rori_i64_fshr(i64 %a) nounwind {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: srli a2, a1, 31
 ; CHECK-NEXT: slli a3, a0, 1
-; CHECK-NEXT: or a2, a3, a2
 ; CHECK-NEXT: srli a0, a0, 31
 ; CHECK-NEXT: slli a1, a1, 1
 ; CHECK-NEXT: or a1, a1, a0
-; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: or a0, a3, a2
 ; CHECK-NEXT: ret
 %1 = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 63)
 ret i64 %1
diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll
new file mode 100644
index 0000000000000..15aa11670e126
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll
@@ -0,0 +1,2603 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m -verify-machineinstrs \
+; RUN: -riscv-experimental-rv64-legal-i32 | FileCheck %s -check-prefix=RV64
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba -verify-machineinstrs \
+; RUN: -riscv-experimental-rv64-legal-i32 | FileCheck %s --check-prefix=RV64ZBA
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zicond -verify-machineinstrs \
+; RUN: -riscv-experimental-rv64-legal-i32 | FileCheck %s --check-prefix=RV64ZICOND
+
+;
+; Get the actual value of the overflow bit.
+;
+define zeroext i1 @saddo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: saddo1.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addw a3, a0, a1
+; RV64-NEXT: add a1, a0, a1
+; RV64-NEXT: xor a3, a1, a3
+; RV64-NEXT: snez a0, a3
+; RV64-NEXT: sw a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo1.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addw a3, a0, a1
+; RV64ZBA-NEXT: add a1, a0, a1
+; RV64ZBA-NEXT: xor a3, a1, a3
+; RV64ZBA-NEXT: snez a0, a3
+; RV64ZBA-NEXT: sw a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo1.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addw a3, a0, a1
+; RV64ZICOND-NEXT: add a1, a0, a1
+; RV64ZICOND-NEXT: xor a3, a1, a3
+; RV64ZICOND-NEXT: snez a0, a3
+; RV64ZICOND-NEXT: sw a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+; Test the immediate version.
+define zeroext i1 @saddo2.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: saddo2.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addiw a2, a0, 4
+; RV64-NEXT: slt a0, a2, a0
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo2.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addiw a2, a0, 4
+; RV64ZBA-NEXT: slt a0, a2, a0
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo2.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addiw a2, a0, 4
+; RV64ZICOND-NEXT: slt a0, a2, a0
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 4)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+; Test negative immediates.
+define zeroext i1 @saddo3.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: saddo3.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addiw a2, a0, -4
+; RV64-NEXT: slt a0, a2, a0
+; RV64-NEXT: xori a0, a0, 1
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo3.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addiw a2, a0, -4
+; RV64ZBA-NEXT: slt a0, a2, a0
+; RV64ZBA-NEXT: xori a0, a0, 1
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo3.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addiw a2, a0, -4
+; RV64ZICOND-NEXT: slt a0, a2, a0
+; RV64ZICOND-NEXT: xori a0, a0, 1
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 -4)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+; Test immediates that are too large to be encoded.
+define zeroext i1 @saddo4.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: saddo4.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: lui a2, 4096
+; RV64-NEXT: addi a2, a2, -1
+; RV64-NEXT: addw a2, a0, a2
+; RV64-NEXT: slt a0, a2, a0
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo4.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: lui a2, 4096
+; RV64ZBA-NEXT: addi a2, a2, -1
+; RV64ZBA-NEXT: addw a2, a0, a2
+; RV64ZBA-NEXT: slt a0, a2, a0
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo4.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: lui a2, 4096
+; RV64ZICOND-NEXT: addi a2, a2, -1
+; RV64ZICOND-NEXT: addw a2, a0, a2
+; RV64ZICOND-NEXT: slt a0, a2, a0
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 16777215)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: saddo1.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: add a3, a0, a1
+; RV64-NEXT: slt a0, a3, a0
+; RV64-NEXT: slti a1, a1, 0
+; RV64-NEXT: xor a0, a1, a0
+; RV64-NEXT: sd a3, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo1.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: add a3, a0, a1
+; RV64ZBA-NEXT: slt a0, a3, a0
+; RV64ZBA-NEXT: slti a1, a1, 0
+; RV64ZBA-NEXT: xor a0, a1, a0
+; RV64ZBA-NEXT: sd a3, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo1.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: add a3, a0, a1
+; RV64ZICOND-NEXT: slt a0, a3, a0
+; RV64ZICOND-NEXT: slti a1, a1, 0
+; RV64ZICOND-NEXT: xor a0, a1, a0
+; RV64ZICOND-NEXT: sd a3, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) {
+; RV64-LABEL: saddo2.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi a2, a0, 4
+; RV64-NEXT: slt a0, a2, a0
+; RV64-NEXT: sd a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo2.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addi a2, a0, 4
+; RV64ZBA-NEXT: slt a0, a2, a0
+; RV64ZBA-NEXT: sd a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo2.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addi a2, a0, 4
+; RV64ZICOND-NEXT: slt a0, a2, a0
+; RV64ZICOND-NEXT: sd a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 4)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @saddo3.i64(i64 %v1, ptr %res) {
+; RV64-LABEL: saddo3.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi a2, a0, -4
+; RV64-NEXT: slt a0, a2, a0
+; RV64-NEXT: xori a0, a0, 1
+; RV64-NEXT: sd a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo3.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addi a2, a0, -4
+; RV64ZBA-NEXT: slt a0, a2, a0
+; RV64ZBA-NEXT: xori a0, a0, 1
+; RV64ZBA-NEXT: sd a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo3.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addi a2, a0, -4
+; RV64ZICOND-NEXT: slt a0, a2, a0
+; RV64ZICOND-NEXT: xori a0, a0, 1
+; RV64ZICOND-NEXT: sd a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -4)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: uaddo.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addw a1, a0, a1
+; RV64-NEXT: sltu a0, a1, a0
+; RV64-NEXT: sw a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: uaddo.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addw a1, a0, a1
+; RV64ZBA-NEXT: sltu a0, a1, a0
+; RV64ZBA-NEXT: sw a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: uaddo.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addw a1, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a1, a0
+; RV64ZICOND-NEXT: sw a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) {
+; RV64-LABEL: uaddo.i32.constant:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addiw a2, a0, -2
+; RV64-NEXT: sltu a0, a2, a0
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: uaddo.i32.constant:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addiw a2, a0, -2
+; RV64ZBA-NEXT: sltu a0, a2, a0
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: uaddo.i32.constant:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addiw a2, a0, -2
+; RV64ZICOND-NEXT: sltu a0, a2, a0
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 -2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i32.constant_one(i32 signext %v1, ptr %res) {
+; RV64-LABEL: uaddo.i32.constant_one:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addiw a2, a0, 1
+; RV64-NEXT: seqz a0, a2
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: uaddo.i32.constant_one:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addiw a2, a0, 1
+; RV64ZBA-NEXT: seqz a0, a2
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: uaddo.i32.constant_one:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addiw a2, a0, 1
+; RV64ZICOND-NEXT: seqz a0, a2
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: uaddo.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: add a1, a0, a1
+; RV64-NEXT: sltu a0, a1, a0
+; RV64-NEXT: sd a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: uaddo.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: add a1, a0, a1
+; RV64ZBA-NEXT: sltu a0, a1, a0
+; RV64ZBA-NEXT: sd a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: uaddo.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: add a1, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a1, a0
+; RV64ZICOND-NEXT: sd a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @uaddo.i64.constant_one(i64 %v1, ptr %res) {
+; RV64-LABEL: uaddo.i64.constant_one:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi a2, a0, 1
+; RV64-NEXT: seqz a0, a2
+; RV64-NEXT: sd a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: uaddo.i64.constant_one:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addi a2, a0, 1
+; RV64ZBA-NEXT: seqz a0, a2
+; RV64ZBA-NEXT: sd a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: uaddo.i64.constant_one:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addi a2, a0, 1
+; RV64ZICOND-NEXT: seqz a0, a2
+; RV64ZICOND-NEXT: sd a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: ssubo1.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: subw a3, a0, a1
+; RV64-NEXT: sub a1, a0, a1
+; RV64-NEXT: xor a3, a1, a3
+; RV64-NEXT: snez a0, a3
+; RV64-NEXT: sw a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: ssubo1.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: subw a3, a0, a1
+; RV64ZBA-NEXT: sub a1, a0, a1
+; RV64ZBA-NEXT: xor a3, a1, a3
+; RV64ZBA-NEXT: snez a0, a3
+; RV64ZBA-NEXT: sw a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: ssubo1.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: subw a3, a0, a1
+; RV64ZICOND-NEXT: sub a1, a0, a1
+; RV64ZICOND-NEXT: xor a3, a1, a3
+; RV64ZICOND-NEXT: snez a0, a3
+; RV64ZICOND-NEXT: sw a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @ssubo2.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: ssubo2.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addiw a2, a0, 4
+; RV64-NEXT: slt a0, a2, a0
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: ssubo2.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addiw a2, a0, 4
+; RV64ZBA-NEXT: slt a0, a2, a0
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: ssubo2.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addiw a2, a0, 4
+; RV64ZICOND-NEXT: slt a0, a2, a0
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 -4)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: ssubo.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: sgtz a3, a1
+; RV64-NEXT: sub a1, a0, a1
+; RV64-NEXT: slt a0, a1, a0
+; RV64-NEXT: xor a0, a3, a0
+; RV64-NEXT: sd a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: ssubo.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: sgtz a3, a1
+; RV64ZBA-NEXT: sub a1, a0, a1
+; RV64ZBA-NEXT: slt a0, a1, a0
+; RV64ZBA-NEXT: xor a0, a3, a0
+; RV64ZBA-NEXT: sd a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: ssubo.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: sgtz a3, a1
+; RV64ZICOND-NEXT: sub a1, a0, a1
+; RV64ZICOND-NEXT: slt a0, a1, a0
+; RV64ZICOND-NEXT: xor a0, a3, a0
+; RV64ZICOND-NEXT: sd a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: usubo.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: subw a1, a0, a1
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: sw a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: usubo.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: subw a1, a0, a1
+; RV64ZBA-NEXT: sltu a0, a0, a1
+; RV64ZBA-NEXT: sw a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: usubo.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: subw a1, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a0, a1
+; RV64ZICOND-NEXT: sw a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) {
+; RV64-LABEL: usubo.i32.constant.rhs:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addiw a2, a0, 2
+; RV64-NEXT: sltu a0, a0, a2
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: usubo.i32.constant.rhs:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addiw a2, a0, 2
+; RV64ZBA-NEXT: sltu a0, a0, a2
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: usubo.i32.constant.rhs:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addiw a2, a0, 2
+; RV64ZICOND-NEXT: sltu a0, a0, a2
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 -2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
+; RV64-LABEL: usubo.i32.constant.lhs:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a2, -2
+; RV64-NEXT: subw a2, a2, a0
+; RV64-NEXT: addi a0, a2, 1
+; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: usubo.i32.constant.lhs:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: li a2, -2
+; RV64ZBA-NEXT: subw a2, a2, a0
+; RV64ZBA-NEXT: addi a0, a2, 1
+; RV64ZBA-NEXT: seqz a0, a0
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: usubo.i32.constant.lhs:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: li a2, -2
+; RV64ZICOND-NEXT: subw a2, a2, a0
+; RV64ZICOND-NEXT: addi a0, a2, 1
+; RV64ZICOND-NEXT: seqz a0, a0
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 -2, i32 %v1)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: usubo.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: sub a1, a0, a1
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: sd a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: usubo.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: sub a1, a0, a1
+; RV64ZBA-NEXT: sltu a0, a0, a1
+; RV64ZBA-NEXT: sd a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: usubo.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: sub a1, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a0, a1
+; RV64ZICOND-NEXT: sd a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @smulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: smulo.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: mulw a3, a0, a1
+; RV64-NEXT: mul a1, a0, a1
+; RV64-NEXT: xor a3, a1, a3
+; RV64-NEXT: snez a0, a3
+; RV64-NEXT: sw a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: smulo.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: mulw a3, a0, a1
+; RV64ZBA-NEXT: mul a1, a0, a1
+; RV64ZBA-NEXT: xor a3, a1, a3
+; RV64ZBA-NEXT: snez a0, a3
+; RV64ZBA-NEXT: sw a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: smulo.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: mulw a3, a0, a1
+; RV64ZICOND-NEXT: mul a1, a0, a1
+; RV64ZICOND-NEXT: xor a3, a1, a3
+; RV64ZICOND-NEXT: snez a0, a3
+; RV64ZICOND-NEXT: sw a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @smulo2.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: smulo2.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a2, 13
+; RV64-NEXT: mulw a3, a0, a2
+; RV64-NEXT: mul a2, a0, a2
+; RV64-NEXT: xor a3, a2, a3
+; RV64-NEXT: snez a0, a3
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: smulo2.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: sh1add a2, a0, a0
+; RV64ZBA-NEXT: sh2add a2, a2, a0
+; RV64ZBA-NEXT: sext.w a0, a2
+; RV64ZBA-NEXT: xor a0, a2, a0
+; RV64ZBA-NEXT: snez a0, a0
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: smulo2.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: li a2, 13
+; RV64ZICOND-NEXT: mulw a3, a0, a2
+; RV64ZICOND-NEXT: mul a2, a0, a2
+; RV64ZICOND-NEXT: xor a3, a2, a3
+; RV64ZICOND-NEXT: snez a0, a3
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 13)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: smulo.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: mulh a3, a0, a1
+; RV64-NEXT: mul a1, a0, a1
+; RV64-NEXT: srai a0, a1, 63
+; RV64-NEXT: xor a0, a3, a0
+; RV64-NEXT: snez a0, a0
+; RV64-NEXT: sd a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: smulo.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: mulh a3, a0, a1
+; RV64ZBA-NEXT: mul a1, a0, a1
+; RV64ZBA-NEXT: srai a0, a1, 63
+; RV64ZBA-NEXT: xor a0, a3, a0
+; RV64ZBA-NEXT: snez a0, a0
+; RV64ZBA-NEXT: sd a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: smulo.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: mulh a3, a0, a1
+; RV64ZICOND-NEXT: mul a1, a0, a1
+; RV64ZICOND-NEXT: srai a0, a1, 63
+; RV64ZICOND-NEXT: xor a0, a3, a0
+; RV64ZICOND-NEXT: snez a0, a0
+; RV64ZICOND-NEXT: sd a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
+; RV64-LABEL: smulo2.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a2, 13
+; RV64-NEXT: mulh a3, a0, a2
+; RV64-NEXT: mul a2, a0, a2
+; RV64-NEXT: srai a0, a2, 63
+; RV64-NEXT: xor a0, a3, a0
+; RV64-NEXT: snez a0, a0
+; RV64-NEXT: sd a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: smulo2.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: li a2, 13
+; RV64ZBA-NEXT: mulh a2, a0, a2
+; RV64ZBA-NEXT: sh1add a3, a0, a0
+; RV64ZBA-NEXT: sh2add a3, a3, a0
+; RV64ZBA-NEXT: srai a0, a3, 63
+; RV64ZBA-NEXT: xor a0, a2, a0
+; RV64ZBA-NEXT: snez a0, a0
+; RV64ZBA-NEXT: sd a3, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: smulo2.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: li a2, 13
+; RV64ZICOND-NEXT: mulh a3, a0, a2
+; RV64ZICOND-NEXT: mul a2, a0, a2
+; RV64ZICOND-NEXT: srai a0, a2, 63
+; RV64ZICOND-NEXT: xor a0, a3, a0
+; RV64ZICOND-NEXT: snez a0, a0
+; RV64ZICOND-NEXT: sd a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 13)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
+; RV64-LABEL: umulo.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: mulhu a1, a0, a1
+; RV64-NEXT: srai a0, a1, 32
+; RV64-NEXT: snez a0, a0
+; RV64-NEXT: sw a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: umulo.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: zext.w a1, a1
+; RV64ZBA-NEXT: zext.w a0, a0
+; RV64ZBA-NEXT: mul a1, a0, a1
+; RV64ZBA-NEXT: srai a0, a1, 32
+; RV64ZBA-NEXT: snez a0, a0
+; RV64ZBA-NEXT: sw a1, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: umulo.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: slli a1, a1, 32
+; RV64ZICOND-NEXT: slli a0, a0, 32
+; RV64ZICOND-NEXT: mulhu a1, a0, a1
+; RV64ZICOND-NEXT: srai a0, a1, 32
+; RV64ZICOND-NEXT: snez a0, a0
+; RV64ZICOND-NEXT: sw a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
+; RV64-LABEL: umulo2.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a2, 13
+; RV64-NEXT: slli a2, a2, 32
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: mulhu a2, a0, a2
+; RV64-NEXT: srli a0, a2, 32
+; RV64-NEXT: snez a0, a0
+; RV64-NEXT: sw a2, 0(a1)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: umulo2.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: zext.w a2, a0
+; RV64ZBA-NEXT: sh1add.uw a0, a0, a2
+; RV64ZBA-NEXT: sh2add a2, a0, a2
+; RV64ZBA-NEXT: srli a0, a2, 32
+; RV64ZBA-NEXT: snez a0, a0
+; RV64ZBA-NEXT: sw a2, 0(a1)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: umulo2.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: li a2, 13
+; RV64ZICOND-NEXT: slli a2, a2, 32
+; RV64ZICOND-NEXT: slli a0, a0, 32
+; RV64ZICOND-NEXT: mulhu a2, a0, a2
+; RV64ZICOND-NEXT: srli a0, a2, 32
+; RV64ZICOND-NEXT: snez a0, a0
+; RV64ZICOND-NEXT: sw a2, 0(a1)
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 13)
+ %val = extractvalue {i32, i1} %t, 0
+ %obit = extractvalue {i32, i1} %t, 1
+ store i32 %val, ptr %res
+ ret i1 %obit
+}
+
+; Similar to umulo.i32, but storing the overflow and returning the result.
+define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) {
+; RV64-LABEL: umulo3.i32:
+; RV64: # %bb.0:
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: slli a0, a0, 32
+; RV64-NEXT: mulhu a0, a0, a1
+; RV64-NEXT: srai a1, a0, 32
+; RV64-NEXT: snez a1, a1
+; RV64-NEXT: sext.w a0, a0
+; RV64-NEXT: sw a1, 0(a2)
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: umulo3.i32:
+; RV64ZBA: # %bb.0:
+; RV64ZBA-NEXT: zext.w a1, a1
+; RV64ZBA-NEXT: zext.w a0, a0
+; RV64ZBA-NEXT: mul a3, a0, a1
+; RV64ZBA-NEXT: srai a3, a3, 32
+; RV64ZBA-NEXT: snez a3, a3
+; RV64ZBA-NEXT: mulw a0, a0, a1
+; RV64ZBA-NEXT: sw a3, 0(a2)
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: umulo3.i32:
+; RV64ZICOND: # %bb.0:
+; RV64ZICOND-NEXT: slli a1, a1, 32
+; RV64ZICOND-NEXT: slli a0, a0, 32
+; RV64ZICOND-NEXT: mulhu a0, a0, a1
+; RV64ZICOND-NEXT: srai a1, a0, 32
+; RV64ZICOND-NEXT: snez a1, a1
+; RV64ZICOND-NEXT: sext.w a0, a0
+; RV64ZICOND-NEXT: sw a1, 0(a2)
+; RV64ZICOND-NEXT: ret
+ %4 = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %0, i32 %1)
+ %5 = extractvalue { i32, i1 } %4, 1
+ %6 = extractvalue { i32, i1 } %4, 0
+ %7 = zext i1 %5 to i32
+ store i32 %7, ptr %2, align 4
+ ret i32 %6
+}
+
+define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
+; RV64-LABEL: umulo.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: mulhu a3, a0, a1
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: sd a0, 0(a2)
+; RV64-NEXT: snez a0, a3
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: umulo.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: mulhu a3, a0, a1
+; RV64ZBA-NEXT: mul a0, a0, a1
+; RV64ZBA-NEXT: sd a0, 0(a2)
+; RV64ZBA-NEXT: snez a0, a3
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: umulo.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: mulhu a3, a0, a1
+; RV64ZICOND-NEXT: mul a0, a0, a1
+; RV64ZICOND-NEXT: sd a0, 0(a2)
+; RV64ZICOND-NEXT: snez a0, a3
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
+; RV64-LABEL: umulo2.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a3, 13
+; RV64-NEXT: mulhu a2, a0, a3
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: sd a0, 0(a1)
+; RV64-NEXT: snez a0, a2
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: umulo2.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: li a2, 13
+; RV64ZBA-NEXT: mulhu a2, a0, a2
+; RV64ZBA-NEXT: sh1add a3, a0, a0
+; RV64ZBA-NEXT: sh2add a0, a3, a0
+; RV64ZBA-NEXT: sd a0, 0(a1)
+; RV64ZBA-NEXT: snez a0, a2
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: umulo2.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: li a3, 13
+; RV64ZICOND-NEXT: mulhu a2, a0, a3
+; RV64ZICOND-NEXT: mul a0, a0, a3
+; RV64ZICOND-NEXT: sd a0, 0(a1)
+; RV64ZICOND-NEXT: snez a0, a2
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 13)
+ %val = extractvalue {i64, i1} %t, 0
+ %obit = extractvalue {i64, i1} %t, 1
+ store i64 %val, ptr %res
+ ret i1 %obit
+}
+
+
+;
+; Check the use of the overflow bit in combination with a select instruction.
+;
+define i32 @saddo.select.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: saddo.select.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addw a2, a0, a1
+; RV64-NEXT: add a3, a0, a1
+; RV64-NEXT: bne a3, a2, .LBB28_2
+; RV64-NEXT: # %bb.1: # %entry
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB28_2: # %entry
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo.select.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addw a2, a0, a1
+; RV64ZBA-NEXT: add a3, a0, a1
+; RV64ZBA-NEXT: bne a3, a2, .LBB28_2
+; RV64ZBA-NEXT: # %bb.1: # %entry
+; RV64ZBA-NEXT: mv a0, a1
+; RV64ZBA-NEXT: .LBB28_2: # %entry
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo.select.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addw a2, a0, a1
+; RV64ZICOND-NEXT: add a3, a0, a1
+; RV64ZICOND-NEXT: xor a2, a3, a2
+; RV64ZICOND-NEXT: czero.nez a1, a1, a2
+; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT: or a0, a0, a1
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i1 @saddo.not.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: saddo.not.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addw a2, a0, a1
+; RV64-NEXT: add a0, a0, a1
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo.not.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addw a2, a0, a1
+; RV64ZBA-NEXT: add a0, a0, a1
+; RV64ZBA-NEXT: xor a0, a0, a2
+; RV64ZBA-NEXT: seqz a0, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo.not.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addw a2, a0, a1
+; RV64ZICOND-NEXT: add a0, a0, a1
+; RV64ZICOND-NEXT: xor a0, a0, a2
+; RV64ZICOND-NEXT: seqz a0, a0
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: saddo.select.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: add a2, a0, a1
+; RV64-NEXT: slt a2, a2, a0
+; RV64-NEXT: slti a3, a1, 0
+; RV64-NEXT: bne a3, a2, .LBB30_2
+; RV64-NEXT: # %bb.1: # %entry
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB30_2: # %entry
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo.select.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: add a2, a0, a1
+; RV64ZBA-NEXT: slt a2, a2, a0
+; RV64ZBA-NEXT: slti a3, a1, 0
+; RV64ZBA-NEXT: bne a3, a2, .LBB30_2
+; RV64ZBA-NEXT: # %bb.1: # %entry
+; RV64ZBA-NEXT: mv a0, a1
+; RV64ZBA-NEXT: .LBB30_2: # %entry
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo.select.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: add a2, a0, a1
+; RV64ZICOND-NEXT: slt a2, a2, a0
+; RV64ZICOND-NEXT: slti a3, a1, 0
+; RV64ZICOND-NEXT: xor a2, a3, a2
+; RV64ZICOND-NEXT: czero.nez a1, a1, a2
+; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT: or a0, a0, a1
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: saddo.not.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: add a2, a0, a1
+; RV64-NEXT: slt a0, a2, a0
+; RV64-NEXT: slti a1, a1, 0
+; RV64-NEXT: xor a0, a1, a0
+; RV64-NEXT: xori a0, a0, 1
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: saddo.not.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: add a2, a0, a1
+; RV64ZBA-NEXT: slt a0, a2, a0
+; RV64ZBA-NEXT: slti a1, a1, 0
+; RV64ZBA-NEXT: xor a0, a1, a0
+; RV64ZBA-NEXT: xori a0, a0, 1
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: saddo.not.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: add a2, a0, a1
+; RV64ZICOND-NEXT: slt a0, a2, a0
+; RV64ZICOND-NEXT: slti a1, a1, 0
+; RV64ZICOND-NEXT: xor a0, a1, a0
+; RV64ZICOND-NEXT: xori a0, a0, 1
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: uaddo.select.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addw a2, a0, a1
+; RV64-NEXT: bltu a2, a0, .LBB32_2
+; RV64-NEXT: # %bb.1: # %entry
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB32_2: # %entry
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: uaddo.select.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addw a2, a0, a1
+; RV64ZBA-NEXT: bltu a2, a0, .LBB32_2
+; RV64ZBA-NEXT: # %bb.1: # %entry
+; RV64ZBA-NEXT: mv a0, a1
+; RV64ZBA-NEXT: .LBB32_2: # %entry
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: uaddo.select.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addw a2, a0, a1
+; RV64ZICOND-NEXT: sltu a2, a2, a0
+; RV64ZICOND-NEXT: czero.nez a1, a1, a2
+; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT: or a0, a0, a1
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: uaddo.not.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addw a1, a0, a1
+; RV64-NEXT: sltu a0, a1, a0
+; RV64-NEXT: xori a0, a0, 1
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: uaddo.not.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: addw a1, a0, a1
+; RV64ZBA-NEXT: sltu a0, a1, a0
+; RV64ZBA-NEXT: xori a0, a0, 1
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: uaddo.not.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: addw a1, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a1, a0
+; RV64ZICOND-NEXT: xori a0, a0, 1
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: uaddo.select.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: add a2, a0, a1
+; RV64-NEXT: bltu a2, a0, .LBB34_2
+; RV64-NEXT: # %bb.1: # %entry
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB34_2: # %entry
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: uaddo.select.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: add a2, a0, a1
+; RV64ZBA-NEXT: bltu a2, a0, .LBB34_2
+; RV64ZBA-NEXT: # %bb.1: # %entry
+; RV64ZBA-NEXT: mv a0, a1
+; RV64ZBA-NEXT: .LBB34_2: # %entry
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: uaddo.select.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: add a2, a0, a1
+; RV64ZICOND-NEXT: sltu a2, a2, a0
+; RV64ZICOND-NEXT: czero.nez a1, a1, a2
+; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT: or a0, a0, a1
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = select i1 %obit, i64 %v1, i64 %v2
+ ret i64 %ret
+}
+
+define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: uaddo.not.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: add a1, a0, a1
+; RV64-NEXT: sltu a0, a1, a0
+; RV64-NEXT: xori a0, a0, 1
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: uaddo.not.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: add a1, a0, a1
+; RV64ZBA-NEXT: sltu a0, a1, a0
+; RV64ZBA-NEXT: xori a0, a0, 1
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: uaddo.not.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: add a1, a0, a1
+; RV64ZICOND-NEXT: sltu a0, a1, a0
+; RV64ZICOND-NEXT: xori a0, a0, 1
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
+ %obit = extractvalue {i64, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: ssubo.select.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: subw a2, a0, a1
+; RV64-NEXT: sub a3, a0, a1
+; RV64-NEXT: bne a3, a2, .LBB36_2
+; RV64-NEXT: # %bb.1: # %entry
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB36_2: # %entry
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: ssubo.select.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: subw a2, a0, a1
+; RV64ZBA-NEXT: sub a3, a0, a1
+; RV64ZBA-NEXT: bne a3, a2, .LBB36_2
+; RV64ZBA-NEXT: # %bb.1: # %entry
+; RV64ZBA-NEXT: mv a0, a1
+; RV64ZBA-NEXT: .LBB36_2: # %entry
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: ssubo.select.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: subw a2, a0, a1
+; RV64ZICOND-NEXT: sub a3, a0, a1
+; RV64ZICOND-NEXT: xor a2, a3, a2
+; RV64ZICOND-NEXT: czero.nez a1, a1, a2
+; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT: or a0, a0, a1
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = select i1 %obit, i32 %v1, i32 %v2
+ ret i32 %ret
+}
+
+define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
+; RV64-LABEL: ssubo.not.i32:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: subw a2, a0, a1
+; RV64-NEXT: sub a0, a0, a1
+; RV64-NEXT: xor a0, a0, a2
+; RV64-NEXT: seqz a0, a0
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: ssubo.not.i32:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: subw a2, a0, a1
+; RV64ZBA-NEXT: sub a0, a0, a1
+; RV64ZBA-NEXT: xor a0, a0, a2
+; RV64ZBA-NEXT: seqz a0, a0
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: ssubo.not.i32:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: subw a2, a0, a1
+; RV64ZICOND-NEXT: sub a0, a0, a1
+; RV64ZICOND-NEXT: xor a0, a0, a2
+; RV64ZICOND-NEXT: seqz a0, a0
+; RV64ZICOND-NEXT: ret
+entry:
+ %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
+ %obit = extractvalue {i32, i1} %t, 1
+ %ret = xor i1 %obit, true
+ ret i1 %ret
+}
+
+define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
+; RV64-LABEL: ssubo.select.i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: sgtz a2, a1
+; RV64-NEXT: sub a3, a0, a1
+; RV64-NEXT: slt a3, a3, a0
+; RV64-NEXT: bne a2, a3, .LBB38_2
+; RV64-NEXT: # %bb.1: # %entry
+; RV64-NEXT: mv a0, a1
+; RV64-NEXT: .LBB38_2: # %entry
+; RV64-NEXT: ret
+;
+; RV64ZBA-LABEL: ssubo.select.i64:
+; RV64ZBA: # %bb.0: # %entry
+; RV64ZBA-NEXT: sgtz a2, a1
+; RV64ZBA-NEXT: sub a3, a0, a1
+; RV64ZBA-NEXT: slt a3, a3, a0
+; RV64ZBA-NEXT: bne a2, a3, .LBB38_2
+; RV64ZBA-NEXT: # %bb.1: # %entry
+; RV64ZBA-NEXT: mv a0, a1
+; RV64ZBA-NEXT: .LBB38_2: # %entry
+; RV64ZBA-NEXT: ret
+;
+; RV64ZICOND-LABEL: ssubo.select.i64:
+; RV64ZICOND: # %bb.0: # %entry
+; RV64ZICOND-NEXT: sgtz a2, a1
+; RV64ZICOND-NEXT: sub a3, a0, a1
+; RV64ZICOND-NEXT: slt a3, a3, a0
+; RV64ZICOND-NEXT: xor a2, a2, a3
+; RV64ZICOND-NEXT: czero.nez a1, a1, a2
+; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
+; RV64ZICOND-NEXT: or a0, a0, a1
+; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i1 @ssub.not.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: ssub.not.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: sgtz a2, a1 +; RV64-NEXT: sub a1, a0, a1 +; RV64-NEXT: slt a0, a1, a0 +; RV64-NEXT: xor a0, a2, a0 +; RV64-NEXT: xori a0, a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: ssub.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sgtz a2, a1 +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: slt a0, a1, a0 +; RV64ZBA-NEXT: xor a0, a2, a0 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: ssub.not.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: sgtz a2, a1 +; RV64ZICOND-NEXT: sub a1, a0, a1 +; RV64ZICOND-NEXT: slt a0, a1, a0 +; RV64ZICOND-NEXT: xor a0, a2, a0 +; RV64ZICOND-NEXT: xori a0, a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = xor i1 %obit, true + ret i1 %ret +} + +define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: usubo.select.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: subw a2, a0, a1 +; RV64-NEXT: bltu a0, a2, .LBB40_2 +; RV64-NEXT: # %bb.1: # %entry +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB40_2: # %entry +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: usubo.select.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: subw a2, a0, a1 +; RV64ZBA-NEXT: bltu a0, a2, .LBB40_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB40_2: # %entry +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: usubo.select.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: subw a2, a0, a1 +; RV64ZICOND-NEXT: sltu a2, a0, a2 +; RV64ZICOND-NEXT: czero.nez a1, a1, a2 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 +; RV64ZICOND-NEXT: or a0, a0, a1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: usubo.not.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: subw a1, a0, a1 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: xori a0, a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: usubo.not.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: subw a1, a0, a1 +; RV64ZBA-NEXT: sltu a0, a0, a1 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: usubo.not.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: subw a1, a0, a1 +; RV64ZICOND-NEXT: sltu a0, a0, a1 +; RV64ZICOND-NEXT: xori a0, a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = xor i1 %obit, true + ret i1 %ret +} + +define i64 @usubo.select.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: usubo.select.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: sub a2, a0, a1 +; RV64-NEXT: bltu a0, a2, .LBB42_2 +; RV64-NEXT: # %bb.1: # %entry +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB42_2: # %entry +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: usubo.select.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sub a2, a0, a1 +; RV64ZBA-NEXT: bltu a0, a2, .LBB42_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB42_2: # %entry +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: usubo.select.i64: +; RV64ZICOND: # %bb.0: # %entry 
+; RV64ZICOND-NEXT: sub a2, a0, a1 +; RV64ZICOND-NEXT: sltu a2, a0, a2 +; RV64ZICOND-NEXT: czero.nez a1, a1, a2 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 +; RV64ZICOND-NEXT: or a0, a0, a1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i1 @usubo.not.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: usubo.not.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: sub a1, a0, a1 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: xori a0, a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: usubo.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: sltu a0, a0, a1 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: usubo.not.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: sub a1, a0, a1 +; RV64ZICOND-NEXT: sltu a0, a0, a1 +; RV64ZICOND-NEXT: xori a0, a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = xor i1 %obit, true + ret i1 %ret +} + +define i32 @smulo.select.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: smulo.select.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mulw a2, a0, a1 +; RV64-NEXT: mul a3, a0, a1 +; RV64-NEXT: bne a3, a2, .LBB44_2 +; RV64-NEXT: # %bb.1: # %entry +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB44_2: # %entry +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: smulo.select.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulw a2, a0, a1 +; RV64ZBA-NEXT: mul a3, a0, a1 +; RV64ZBA-NEXT: bne a3, a2, .LBB44_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB44_2: # %entry +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: smulo.select.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: mulw a2, a0, a1 +; RV64ZICOND-NEXT: mul a3, a0, a1 +; RV64ZICOND-NEXT: xor a2, a3, a2 +; RV64ZICOND-NEXT: czero.nez a1, a1, a2 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 +; RV64ZICOND-NEXT: or a0, a0, a1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i1 @smulo.not.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: smulo.not.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mulw a2, a0, a1 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: xor a0, a0, a2 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: smulo.not.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulw a2, a0, a1 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: xor a0, a0, a2 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: smulo.not.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: mulw a2, a0, a1 +; RV64ZICOND-NEXT: mul a0, a0, a1 +; RV64ZICOND-NEXT: xor a0, a0, a2 +; RV64ZICOND-NEXT: seqz a0, a0 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = xor i1 %obit, true + ret i1 %ret +} + +define i64 @smulo.select.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: smulo.select.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mulh a2, a0, a1 +; RV64-NEXT: mul a3, a0, a1 +; RV64-NEXT: srai a3, a3, 63 +; RV64-NEXT: bne a2, a3, .LBB46_2 +; RV64-NEXT: # %bb.1: # %entry +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB46_2: # %entry +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: smulo.select.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: 
mulh a2, a0, a1 +; RV64ZBA-NEXT: mul a3, a0, a1 +; RV64ZBA-NEXT: srai a3, a3, 63 +; RV64ZBA-NEXT: bne a2, a3, .LBB46_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB46_2: # %entry +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: smulo.select.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: mulh a2, a0, a1 +; RV64ZICOND-NEXT: mul a3, a0, a1 +; RV64ZICOND-NEXT: srai a3, a3, 63 +; RV64ZICOND-NEXT: xor a2, a2, a3 +; RV64ZICOND-NEXT: czero.nez a1, a1, a2 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 +; RV64ZICOND-NEXT: or a0, a0, a1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i1 @smulo.not.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: smulo.not.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mulh a2, a0, a1 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: xor a0, a2, a0 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: smulo.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulh a2, a0, a1 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: srai a0, a0, 63 +; RV64ZBA-NEXT: xor a0, a2, a0 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: smulo.not.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: mulh a2, a0, a1 +; RV64ZICOND-NEXT: mul a0, a0, a1 +; RV64ZICOND-NEXT: srai a0, a0, 63 +; RV64ZICOND-NEXT: xor a0, a2, a0 +; RV64ZICOND-NEXT: seqz a0, a0 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = xor i1 %obit, true + ret i1 %ret +} + +define i32 @umulo.select.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: umulo.select.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: slli a2, a1, 32 +; RV64-NEXT: slli a3, a0, 32 +; RV64-NEXT: mulhu a2, a3, a2 +; RV64-NEXT: srai a2, a2, 32 +; RV64-NEXT: bnez a2, .LBB48_2 +; RV64-NEXT: # %bb.1: # %entry +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB48_2: # %entry +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: umulo.select.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: zext.w a2, a1 +; RV64ZBA-NEXT: zext.w a3, a0 +; RV64ZBA-NEXT: mul a2, a3, a2 +; RV64ZBA-NEXT: srai a2, a2, 32 +; RV64ZBA-NEXT: bnez a2, .LBB48_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB48_2: # %entry +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: umulo.select.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: slli a2, a1, 32 +; RV64ZICOND-NEXT: slli a3, a0, 32 +; RV64ZICOND-NEXT: mulhu a2, a3, a2 +; RV64ZICOND-NEXT: srai a2, a2, 32 +; RV64ZICOND-NEXT: czero.nez a1, a1, a2 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 +; RV64ZICOND-NEXT: or a0, a0, a1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i1 @umulo.not.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: umulo.not.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: mulhu a0, a0, a1 +; RV64-NEXT: srai a0, a0, 32 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: umulo.not.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: zext.w a1, a1 +; RV64ZBA-NEXT: zext.w a0, a0 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: srai a0, a0, 32 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: 
umulo.not.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: slli a1, a1, 32 +; RV64ZICOND-NEXT: slli a0, a0, 32 +; RV64ZICOND-NEXT: mulhu a0, a0, a1 +; RV64ZICOND-NEXT: srai a0, a0, 32 +; RV64ZICOND-NEXT: seqz a0, a0 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = xor i1 %obit, true + ret i1 %ret +} + +define i64 @umulo.select.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: umulo.select.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mulhu a2, a0, a1 +; RV64-NEXT: bnez a2, .LBB50_2 +; RV64-NEXT: # %bb.1: # %entry +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB50_2: # %entry +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: umulo.select.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulhu a2, a0, a1 +; RV64ZBA-NEXT: bnez a2, .LBB50_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB50_2: # %entry +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: umulo.select.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: mulhu a2, a0, a1 +; RV64ZICOND-NEXT: czero.nez a1, a1, a2 +; RV64ZICOND-NEXT: czero.eqz a0, a0, a2 +; RV64ZICOND-NEXT: or a0, a0, a1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i1 @umulo.not.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: umulo.not.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mulhu a0, a0, a1 +; RV64-NEXT: seqz a0, a0 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: umulo.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulhu a0, a0, a1 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: umulo.not.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: mulhu a0, a0, a1 +; RV64ZICOND-NEXT: seqz a0, a0 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = xor i1 %obit, true + ret i1 %ret +} + + +; +; Check the use of the overflow bit in combination with a branch instruction. 
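+; Unlike the .select and .not tests above, which materialize the flag with
+; czero/seqz/xori sequences, here the i1 overflow result feeds br directly,
+; so the comparison folds into a single conditional branch (beq/bgeu/beqz).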
+; +define zeroext i1 @saddo.br.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: saddo.br.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addw a2, a0, a1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: beq a0, a2, .LBB52_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB52_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: saddo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addw a2, a0, a1 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: beq a0, a2, .LBB52_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB52_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: saddo.br.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: addw a2, a0, a1 +; RV64ZICOND-NEXT: add a0, a0, a1 +; RV64ZICOND-NEXT: beq a0, a2, .LBB52_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB52_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: saddo.br.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: add a2, a0, a1 +; RV64-NEXT: slt a0, a2, a0 +; RV64-NEXT: slti a1, a1, 0 +; RV64-NEXT: beq a1, a0, .LBB53_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB53_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: saddo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a2, a0, a1 +; RV64ZBA-NEXT: slt a0, a2, a0 +; RV64ZBA-NEXT: slti a1, a1, 0 +; RV64ZBA-NEXT: beq a1, a0, .LBB53_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB53_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: saddo.br.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: add a2, a0, a1 +; RV64ZICOND-NEXT: slt a0, a2, a0 +; RV64ZICOND-NEXT: slti a1, a1, 0 +; RV64ZICOND-NEXT: beq a1, a0, .LBB53_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB53_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) { +; RV64-LABEL: uaddo.br.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addw a1, a0, a1 +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: bgeu a1, a0, .LBB54_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB54_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addw a1, a0, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: bgeu a1, a0, .LBB54_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB54_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: uaddo.br.i32: +; RV64ZICOND: # %bb.0: # %entry +; 
RV64ZICOND-NEXT: addw a1, a0, a1 +; RV64ZICOND-NEXT: sext.w a0, a0 +; RV64ZICOND-NEXT: bgeu a1, a0, .LBB54_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB54_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: uaddo.br.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: bgeu a1, a0, .LBB55_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB55_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a1, a0, a1 +; RV64ZBA-NEXT: bgeu a1, a0, .LBB55_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB55_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: uaddo.br.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: add a1, a0, a1 +; RV64ZICOND-NEXT: bgeu a1, a0, .LBB55_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB55_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: ssubo.br.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: subw a2, a0, a1 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: beq a0, a2, .LBB56_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB56_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: ssubo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: subw a2, a0, a1 +; RV64ZBA-NEXT: sub a0, a0, a1 +; RV64ZBA-NEXT: beq a0, a2, .LBB56_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB56_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: ssubo.br.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: subw a2, a0, a1 +; RV64ZICOND-NEXT: sub a0, a0, a1 +; RV64ZICOND-NEXT: beq a0, a2, .LBB56_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB56_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: ssubo.br.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: sgtz a2, a1 +; RV64-NEXT: sub a1, a0, a1 +; RV64-NEXT: slt a0, a1, a0 +; RV64-NEXT: beq a2, a0, .LBB57_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB57_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: ssubo.br.i64: 
+; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sgtz a2, a1 +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: slt a0, a1, a0 +; RV64ZBA-NEXT: beq a2, a0, .LBB57_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB57_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: ssubo.br.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: sgtz a2, a1 +; RV64ZICOND-NEXT: sub a1, a0, a1 +; RV64ZICOND-NEXT: slt a0, a1, a0 +; RV64ZICOND-NEXT: beq a2, a0, .LBB57_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB57_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: usubo.br.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: subw a1, a0, a1 +; RV64-NEXT: bgeu a0, a1, .LBB58_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB58_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: usubo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: subw a1, a0, a1 +; RV64ZBA-NEXT: bgeu a0, a1, .LBB58_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB58_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: usubo.br.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: subw a1, a0, a1 +; RV64ZICOND-NEXT: bgeu a0, a1, .LBB58_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB58_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: usubo.br.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: sub a1, a0, a1 +; RV64-NEXT: bgeu a0, a1, .LBB59_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB59_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: usubo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: bgeu a0, a1, .LBB59_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB59_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: usubo.br.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: sub a1, a0, a1 +; RV64ZICOND-NEXT: bgeu a0, a1, .LBB59_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB59_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @smulo.br.i32(i32 signext %v1, i32 signext %v2) { +; 
RV64-LABEL: smulo.br.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mulw a2, a0, a1 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: beq a0, a2, .LBB60_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB60_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: smulo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulw a2, a0, a1 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: beq a0, a2, .LBB60_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB60_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: smulo.br.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: mulw a2, a0, a1 +; RV64ZICOND-NEXT: mul a0, a0, a1 +; RV64ZICOND-NEXT: beq a0, a2, .LBB60_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB60_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: smulo.br.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mulh a2, a0, a1 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: beq a2, a0, .LBB61_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB61_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: smulo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulh a2, a0, a1 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: srai a0, a0, 63 +; RV64ZBA-NEXT: beq a2, a0, .LBB61_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB61_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: smulo.br.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: mulh a2, a0, a1 +; RV64ZICOND-NEXT: mul a0, a0, a1 +; RV64ZICOND-NEXT: srai a0, a0, 63 +; RV64ZICOND-NEXT: beq a2, a0, .LBB61_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB61_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @smulo2.br.i64(i64 %v1) { +; RV64-LABEL: smulo2.br.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: li a1, -13 +; RV64-NEXT: mulh a2, a0, a1 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: srai a0, a0, 63 +; RV64-NEXT: beq a2, a0, .LBB62_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB62_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: smulo2.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: li a1, -13 +; RV64ZBA-NEXT: mulh a2, a0, a1 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: srai a0, a0, 63 +; RV64ZBA-NEXT: beq a2, a0, .LBB62_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB62_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: smulo2.br.i64: +; 
RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: li a1, -13 +; RV64ZICOND-NEXT: mulh a2, a0, a1 +; RV64ZICOND-NEXT: mul a0, a0, a1 +; RV64ZICOND-NEXT: srai a0, a0, 63 +; RV64ZICOND-NEXT: beq a2, a0, .LBB62_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB62_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 -13) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @umulo.br.i32(i32 signext %v1, i32 signext %v2) { +; RV64-LABEL: umulo.br.i32: +; RV64: # %bb.0: # %entry +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: mulhu a0, a0, a1 +; RV64-NEXT: srai a0, a0, 32 +; RV64-NEXT: beqz a0, .LBB63_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB63_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: umulo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: zext.w a1, a1 +; RV64ZBA-NEXT: zext.w a0, a0 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: srai a0, a0, 32 +; RV64ZBA-NEXT: beqz a0, .LBB63_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB63_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: umulo.br.i32: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: slli a1, a1, 32 +; RV64ZICOND-NEXT: slli a0, a0, 32 +; RV64ZICOND-NEXT: mulhu a0, a0, a1 +; RV64ZICOND-NEXT: srai a0, a0, 32 +; RV64ZICOND-NEXT: beqz a0, .LBB63_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB63_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { +; RV64-LABEL: umulo.br.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: mulhu a0, a0, a1 +; RV64-NEXT: beqz a0, .LBB64_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB64_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: umulo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulhu a0, a0, a1 +; RV64ZBA-NEXT: beqz a0, .LBB64_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB64_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: umulo.br.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: mulhu a0, a0, a1 +; RV64ZICOND-NEXT: beqz a0, .LBB64_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB64_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @umulo2.br.i64(i64 %v1) { +; RV64-LABEL: umulo2.br.i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: add a1, a0, a0 +; 
RV64-NEXT: bgeu a1, a0, .LBB65_2 +; RV64-NEXT: # %bb.1: # %overflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB65_2: # %continue +; RV64-NEXT: li a0, 1 +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: umulo2.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a1, a0, a0 +; RV64ZBA-NEXT: bgeu a1, a0, .LBB65_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB65_2: # %continue +; RV64ZBA-NEXT: li a0, 1 +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: umulo2.br.i64: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: add a1, a0, a0 +; RV64ZICOND-NEXT: bgeu a1, a0, .LBB65_2 +; RV64ZICOND-NEXT: # %bb.1: # %overflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: ret +; RV64ZICOND-NEXT: .LBB65_2: # %continue +; RV64ZICOND-NEXT: li a0, 1 +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) { +; RV64-LABEL: uaddo.i64.constant: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi a2, a0, 2 +; RV64-NEXT: sltu a0, a2, a0 +; RV64-NEXT: sd a2, 0(a1) +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.i64.constant: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a2, a0, 2 +; RV64ZBA-NEXT: sltu a0, a2, a0 +; RV64ZBA-NEXT: sd a2, 0(a1) +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: uaddo.i64.constant: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: addi a2, a0, 2 +; RV64ZICOND-NEXT: sltu a0, a2, a0 +; RV64ZICOND-NEXT: sd a2, 0(a1) +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + store i64 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) { +; RV64-LABEL: uaddo.i64.constant_2048: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi a2, a0, 2047 +; RV64-NEXT: addi a2, a2, 1 +; RV64-NEXT: sltu a0, a2, a0 +; RV64-NEXT: sd a2, 0(a1) +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.i64.constant_2048: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a2, a0, 2047 +; RV64ZBA-NEXT: addi a2, a2, 1 +; RV64ZBA-NEXT: sltu a0, a2, a0 +; RV64ZBA-NEXT: sd a2, 0(a1) +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: uaddo.i64.constant_2048: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: addi a2, a0, 2047 +; RV64ZICOND-NEXT: addi a2, a2, 1 +; RV64ZICOND-NEXT: sltu a0, a2, a0 +; RV64ZICOND-NEXT: sd a2, 0(a1) +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2048) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + store i64 %val, ptr %res + ret i1 %obit +} + +define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) { +; RV64-LABEL: uaddo.i64.constant_2049: +; RV64: # %bb.0: # %entry +; RV64-NEXT: addi a2, a0, 2047 +; RV64-NEXT: addi a2, a2, 2 +; RV64-NEXT: sltu a0, a2, a0 +; RV64-NEXT: sd a2, 0(a1) +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.i64.constant_2049: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a2, a0, 2047 +; RV64ZBA-NEXT: addi a2, a2, 2 +; RV64ZBA-NEXT: sltu a0, a2, a0 +; RV64ZBA-NEXT: sd a2, 0(a1) +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: uaddo.i64.constant_2049: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: addi a2, a0, 2047 +; RV64ZICOND-NEXT: addi a2, a2, 2 +; RV64ZICOND-NEXT: sltu a0, a2, a0 +; 
RV64ZICOND-NEXT: sd a2, 0(a1) +; RV64ZICOND-NEXT: ret +entry: + %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2049) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + store i64 %val, ptr %res + ret i1 %obit +} + +define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) { +; RV64-LABEL: uaddo.i64.constant_setcc_on_overflow_flag: +; RV64: # %bb.0: # %entry +; RV64-NEXT: ld a1, 0(a0) +; RV64-NEXT: addi a0, a1, 2 +; RV64-NEXT: bltu a0, a1, .LBB69_2 +; RV64-NEXT: # %bb.1: # %IfOverflow +; RV64-NEXT: li a0, 0 +; RV64-NEXT: .LBB69_2: # %IfNoOverflow +; RV64-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.i64.constant_setcc_on_overflow_flag: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: ld a1, 0(a0) +; RV64ZBA-NEXT: addi a0, a1, 2 +; RV64ZBA-NEXT: bltu a0, a1, .LBB69_2 +; RV64ZBA-NEXT: # %bb.1: # %IfOverflow +; RV64ZBA-NEXT: li a0, 0 +; RV64ZBA-NEXT: .LBB69_2: # %IfNoOverflow +; RV64ZBA-NEXT: ret +; +; RV64ZICOND-LABEL: uaddo.i64.constant_setcc_on_overflow_flag: +; RV64ZICOND: # %bb.0: # %entry +; RV64ZICOND-NEXT: ld a1, 0(a0) +; RV64ZICOND-NEXT: addi a0, a1, 2 +; RV64ZICOND-NEXT: bltu a0, a1, .LBB69_2 +; RV64ZICOND-NEXT: # %bb.1: # %IfOverflow +; RV64ZICOND-NEXT: li a0, 0 +; RV64ZICOND-NEXT: .LBB69_2: # %IfNoOverflow +; RV64ZICOND-NEXT: ret +entry: + %v1 = load i64, ptr %p + %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %IfNoOverflow, label %IfOverflow +IfOverflow: + ret i64 0 +IfNoOverflow: + ret i64 %val +} + +declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone +declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone +declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone +declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone +declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone +declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone +declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone diff --git a/llvm/test/CodeGen/RISCV/rv64-statepoint-call-lowering.ll b/llvm/test/CodeGen/RISCV/rv64-statepoint-call-lowering.ll index 2fa344d4d79a7..15c1f4325f117 100644 --- a/llvm/test/CodeGen/RISCV/rv64-statepoint-call-lowering.ll +++ b/llvm/test/CodeGen/RISCV/rv64-statepoint-call-lowering.ll @@ -186,9 +186,8 @@ define i1 @test_cross_bb(ptr addrspace(1) %a, i1 %external_cond) gc "statepoint- ; CHECK-NEXT: .Ltmp8: ; CHECK-NEXT: beqz s0, .LBB8_2 ; CHECK-NEXT: # %bb.1: # %left -; CHECK-NEXT: ld a1, 8(sp) ; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: ld a0, 8(sp) ; CHECK-NEXT: call consume ; CHECK-NEXT: mv a0, s0 ; CHECK-NEXT: j .LBB8_3 diff --git a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll index 7839b602706db..3c0888fa17036 100644 --- a/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll +++ b/llvm/test/CodeGen/RISCV/rvv/constant-folding-crash.ll @@ -23,9 +23,8 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan ; RV32-NEXT: 
seqz a0, a0 ; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV32-NEXT: vmv.v.x v10, a0 -; RV32-NEXT: vmsne.vi v10, v10, 0 ; RV32-NEXT: vmv1r.v v11, v0 -; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vmsne.vi v0, v10, 0 ; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; RV32-NEXT: vmerge.vvm v8, v9, v8, v0 ; RV32-NEXT: vmv.x.s a0, v8 @@ -47,9 +46,8 @@ define void @constant_folding_crash(ptr %v54, <4 x ptr> %lanes.a, <4 x ptr> %lan ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma ; RV64-NEXT: vmv.v.x v12, a0 -; RV64-NEXT: vmsne.vi v12, v12, 0 ; RV64-NEXT: vmv1r.v v13, v0 -; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vmsne.vi v0, v12, 0 ; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma ; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 ; RV64-NEXT: vmv.x.s a0, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index 9f8de22b25c2d..27a5773e64043 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -26,10 +26,10 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) { ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: vadd.vi v12, v11, -16 ; CHECK-NEXT: vrgather.vv v9, v8, v12, v0.t -; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vrgather.vv v13, v10, v12 ; CHECK-NEXT: vadd.vi v10, v11, -15 +; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vrgather.vv v13, v8, v10, v0.t ; CHECK-NEXT: vmsne.vi v8, v13, 0 ; CHECK-NEXT: vmv.v.v v0, v9 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll index 51eb63f5f9221..6acda17e412b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll @@ -52,9 +52,8 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -68,9 +67,8 @@ define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma @@ -124,9 +122,8 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -140,9 +137,8 @@ define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, 
v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 -; ZVFHMIN-NEXT: vmv.v.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma @@ -198,9 +194,8 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv.v.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -214,8 +209,8 @@ define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 ; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14 @@ -274,8 +269,8 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmv1r.v v0, v12 ; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v14 @@ -290,8 +285,8 @@ define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 ; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20 @@ -326,9 +321,8 @@ define <2 x float> @vfmax_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i3 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -360,9 +354,8 @@ define <4 x float> @vfmax_vv_v4f32_unmasked(<4 x float> %va, <4 x float> %vb, i3 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -396,8 +389,8 @@ define <8 x float> @vfmax_vv_v8f32_unmasked(<8 x float> %va, <8 x float> %vb, i3 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v14 @@ -432,8 +425,8 @@ define <16 x float> @vfmax_vv_v16f32_unmasked(<16 x float> %va, <16 x float> %vb ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; 
CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v20 @@ -466,9 +459,8 @@ define <2 x double> @vfmax_vv_v2f64_unmasked(<2 x double> %va, <2 x double> %vb, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -502,8 +494,8 @@ define <4 x double> @vfmax_vv_v4f64_unmasked(<4 x double> %va, <4 x double> %vb, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v14 @@ -538,8 +530,8 @@ define <8 x double> @vfmax_vv_v8f64_unmasked(<8 x double> %va, <8 x double> %vb, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v20 @@ -587,8 +579,8 @@ define <16 x double> @vfmax_vv_v16f64_unmasked(<16 x double> %va, <16 x double> ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v24 @@ -767,8 +759,8 @@ define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 ; CHECK-NEXT: vfmax.vv v16, v16, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll index 02c2fafc89785..d60a4ff4391d7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum.ll @@ -15,9 +15,8 @@ define <2 x half> @vfmax_v2f16_vv(<2 x half> %a, <2 x half> %b) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -29,9 +28,8 @@ define <2 x half> @vfmax_v2f16_vv(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, 
ta, ma @@ -48,9 +46,8 @@ define <4 x half> @vfmax_v4f16_vv(<4 x half> %a, <4 x half> %b) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -62,9 +59,8 @@ define <4 x half> @vfmax_v4f16_vv(<4 x half> %a, <4 x half> %b) { ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0 -; ZVFHMIN-NEXT: vmv.v.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -81,9 +77,8 @@ define <8 x half> @vfmax_v8f16_vv(<8 x half> %a, <8 x half> %b) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv.v.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -95,8 +90,8 @@ define <8 x half> @vfmax_v8f16_vv(<8 x half> %a, <8 x half> %b) { ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmerge.vvm v14, v12, v10, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0 ; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14 @@ -114,8 +109,8 @@ define <16 x half> @vfmax_v16f16_vv(<16 x half> %a, <16 x half> %b) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmv1r.v v0, v12 ; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v14 @@ -128,8 +123,8 @@ define <16 x half> @vfmax_v16f16_vv(<16 x half> %a, <16 x half> %b) { ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v20, v16, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0 ; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20 @@ -147,9 +142,8 @@ define <2 x float> @vfmax_v2f32_vv(<2 x float> %a, <2 x float> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -164,9 +158,8 @@ define <4 x float> @vfmax_v4f32_vv(<4 x float> %a, <4 x float> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -181,8 +174,8 @@ define <8 x float> @vfmax_v8f32_vv(<8 x float> %a, <8 
x float> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v14 @@ -198,8 +191,8 @@ define <16 x float> @vfmax_v16f32_vv(<16 x float> %a, <16 x float> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v20 @@ -215,9 +208,8 @@ define <2 x double> @vfmax_v2f64_vv(<2 x double> %a, <2 x double> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -232,8 +224,8 @@ define <4 x double> @vfmax_v4f64_vv(<4 x double> %a, <4 x double> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v14 @@ -249,8 +241,8 @@ define <8 x double> @vfmax_v8f64_vv(<8 x double> %a, <8 x double> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v20 @@ -266,8 +258,8 @@ define <16 x double> @vfmax_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v24 @@ -304,9 +296,8 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vfadd.vv v10, v8, v8 ; ZVFH-NEXT: vmfeq.vv v0, v9, v9 -; ZVFH-NEXT: vmfeq.vv v8, v10, v10 ; ZVFH-NEXT: vmerge.vvm v11, v9, v10, v0 -; ZVFH-NEXT: vmv1r.v v0, v8 +; ZVFH-NEXT: vmfeq.vv v0, v10, v10 ; ZVFH-NEXT: vmerge.vvm v8, v10, v9, v0 ; ZVFH-NEXT: vfmax.vv v8, v11, v8 ; ZVFH-NEXT: ret @@ -325,9 +316,8 @@ define <2 x half> @vfmax_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9 ; ZVFHMIN-NEXT: vmerge.vvm v10, v11, v9, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 ; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -345,9 +335,8 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: 
vfadd.vv v10, v9, v9 ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v9, v10, v10 ; ZVFH-NEXT: vmerge.vvm v11, v8, v10, v0 -; ZVFH-NEXT: vmv1r.v v0, v9 +; ZVFH-NEXT: vmfeq.vv v0, v10, v10 ; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -366,9 +355,8 @@ define <2 x half> @vfmax_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v11, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v9, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll index 03e0ac42c442c..5f78561224772 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll @@ -52,9 +52,8 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -68,9 +67,8 @@ define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 z ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 ; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma @@ -124,9 +122,8 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -140,9 +137,8 @@ define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 z ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 -; ZVFHMIN-NEXT: vmv.v.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 ; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma @@ -198,9 +194,8 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv.v.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -214,8 +209,8 @@ define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 z ; ZVFHMIN-NEXT: 
vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 ; ZVFHMIN-NEXT: vfmin.vv v10, v8, v14 @@ -274,8 +269,8 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmv1r.v v0, v12 ; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v14 @@ -290,8 +285,8 @@ define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i ; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 ; ZVFHMIN-NEXT: vfmin.vv v12, v8, v20 @@ -326,9 +321,8 @@ define <2 x float> @vfmin_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i3 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -360,9 +354,8 @@ define <4 x float> @vfmin_vv_v4f32_unmasked(<4 x float> %va, <4 x float> %vb, i3 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -396,8 +389,8 @@ define <8 x float> @vfmin_vv_v8f32_unmasked(<8 x float> %va, <8 x float> %vb, i3 ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v14 @@ -432,8 +425,8 @@ define <16 x float> @vfmin_vv_v16f32_unmasked(<16 x float> %va, <16 x float> %vb ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v20 @@ -466,9 +459,8 @@ define <2 x double> @vfmin_vv_v2f64_unmasked(<2 x double> %va, <2 x double> %vb, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -502,8 +494,8 @@ define <4 x double> @vfmin_vv_v4f64_unmasked(<4 x double> %va, <4 x double> %vb, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli 
zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v14 @@ -538,8 +530,8 @@ define <8 x double> @vfmin_vv_v8f64_unmasked(<8 x double> %va, <8 x double> %vb, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v20 @@ -587,8 +579,8 @@ define <16 x double> @vfmin_vv_v16f64_unmasked(<16 x double> %va, <16 x double> ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v24 @@ -767,8 +759,8 @@ define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> %va, <32 x double> ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 ; CHECK-NEXT: vfmin.vv v16, v16, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll index b15d697f0754e..991fe821a8421 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum.ll @@ -15,9 +15,8 @@ define <2 x half> @vfmin_v2f16_vv(<2 x half> %a, <2 x half> %b) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -29,9 +28,8 @@ define <2 x half> @vfmin_v2f16_vv(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v9, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -48,9 +46,8 @@ define <4 x half> @vfmin_v4f16_vv(<4 x half> %a, <4 x half> %b) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -62,9 +59,8 @@ define <4 x half> @vfmin_v4f16_vv(<4 x half> %a, <4 x half> %b) { ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmerge.vvm v11, v9, v10, v0 -; ZVFHMIN-NEXT: vmv.v.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 ; ZVFHMIN-NEXT: 
vmerge.vvm v8, v10, v9, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v8, v11 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -81,9 +77,8 @@ define <8 x half> @vfmin_v8f16_vv(<8 x half> %a, <8 x half> %b) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv.v.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -95,8 +90,8 @@ define <8 x half> @vfmin_v8f16_vv(<8 x half> %a, <8 x half> %b) { ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 -; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmerge.vvm v14, v12, v10, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v10, v10 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v10, v12, v0 ; ZVFHMIN-NEXT: vfmin.vv v10, v8, v14 @@ -114,8 +109,8 @@ define <16 x half> @vfmin_v16f16_vv(<16 x half> %a, <16 x half> %b) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmv1r.v v0, v12 ; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v14 @@ -128,8 +123,8 @@ define <16 x half> @vfmin_v16f16_vv(<16 x half> %a, <16 x half> %b) { ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma ; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v20, v16, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v16, v0 ; ZVFHMIN-NEXT: vfmin.vv v12, v8, v20 @@ -147,9 +142,8 @@ define <2 x float> @vfmin_v2f32_vv(<2 x float> %a, <2 x float> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -164,9 +158,8 @@ define <4 x float> @vfmin_v4f32_vv(<4 x float> %a, <4 x float> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -181,8 +174,8 @@ define <8 x float> @vfmin_v8f32_vv(<8 x float> %a, <8 x float> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v14 @@ -198,8 +191,8 @@ define <16 x float> @vfmin_v16f32_vv(<16 x float> %a, <16 x float> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v20 @@ -215,9 +208,8 @@ define <2 x double> 
@vfmin_v2f64_vv(<2 x double> %a, <2 x double> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -232,8 +224,8 @@ define <4 x double> @vfmin_v4f64_vv(<4 x double> %a, <4 x double> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v14 @@ -249,8 +241,8 @@ define <8 x double> @vfmin_v8f64_vv(<8 x double> %a, <8 x double> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v20 @@ -266,8 +258,8 @@ define <16 x double> @vfmin_v16f64_vv(<16 x double> %a, <16 x double> %b) nounwi ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v24 @@ -304,9 +296,8 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vfadd.vv v10, v8, v8 ; ZVFH-NEXT: vmfeq.vv v0, v9, v9 -; ZVFH-NEXT: vmfeq.vv v8, v10, v10 ; ZVFH-NEXT: vmerge.vvm v11, v9, v10, v0 -; ZVFH-NEXT: vmv1r.v v0, v8 +; ZVFH-NEXT: vmfeq.vv v0, v10, v10 ; ZVFH-NEXT: vmerge.vvm v8, v10, v9, v0 ; ZVFH-NEXT: vfmin.vv v8, v11, v8 ; ZVFH-NEXT: ret @@ -325,9 +316,8 @@ define <2 x half> @vfmin_v2f16_vv_nnana(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v9, v9 ; ZVFHMIN-NEXT: vmerge.vvm v10, v11, v9, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v9, v9 ; ZVFHMIN-NEXT: vmerge.vvm v8, v9, v11, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -345,9 +335,8 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vfadd.vv v10, v9, v9 ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v9, v10, v10 ; ZVFH-NEXT: vmerge.vvm v11, v8, v10, v0 -; ZVFH-NEXT: vmv1r.v v0, v9 +; ZVFH-NEXT: vmfeq.vv v0, v10, v10 ; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -366,9 +355,8 @@ define <2 x half> @vfmin_v2f16_vv_nnanb(<2 x half> %a, <2 x half> %b) { ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v10, v9, v11, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v9, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v8, v10 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, 
mf4, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll index a1e81ea41c249..07d8c804293a4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll @@ -388,8 +388,8 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmfeq.vf v8, v16, fa5 ; RV32-NEXT: vse64.v v24, (a1), v0.t +; RV32-NEXT: vmfeq.vf v8, v16, fa5 ; RV32-NEXT: addi a0, a1, 128 ; RV32-NEXT: vmv1r.v v0, v8 ; RV32-NEXT: addi a1, sp, 16 @@ -428,8 +428,8 @@ define void @masked_store_v32f64(<32 x double>* %val_ptr, <32 x double>* %a, <32 ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vmfeq.vf v8, v16, fa5 ; RV64-NEXT: vse64.v v24, (a1), v0.t +; RV64-NEXT: vmfeq.vf v8, v16, fa5 ; RV64-NEXT: addi a0, a1, 128 ; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: addi a1, sp, 16 @@ -496,8 +496,8 @@ define void @masked_store_v64f32(<64 x float>* %val_ptr, <64 x float>* %a, <64 x ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vf v8, v16, fa5 ; CHECK-NEXT: vse32.v v24, (a1), v0.t +; CHECK-NEXT: vmfeq.vf v8, v16, fa5 ; CHECK-NEXT: addi a0, a1, 128 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: addi a1, sp, 16 @@ -545,8 +545,8 @@ define void @masked_store_v128f16(<128 x half>* %val_ptr, <128 x half>* %a, <128 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vf v8, v16, fa5 ; CHECK-NEXT: vse16.v v24, (a1), v0.t +; CHECK-NEXT: vmfeq.vf v8, v16, fa5 ; CHECK-NEXT: addi a0, a1, 128 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: addi a1, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll index 86c28247e97ef..395af121b3c6b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-int.ll @@ -466,8 +466,8 @@ define void @masked_store_v32i64(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; RV64-NEXT: add a0, sp, a0 ; RV64-NEXT: addi a0, a0, 16 ; RV64-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV64-NEXT: vmseq.vi v8, v16, 0 ; RV64-NEXT: vse64.v v24, (a1), v0.t +; RV64-NEXT: vmseq.vi v8, v16, 0 ; RV64-NEXT: addi a0, a1, 128 ; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: addi a1, sp, 16 @@ -550,8 +550,8 @@ define void @masked_store_v64i32(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmseq.vi v8, v16, 0 ; CHECK-NEXT: vse32.v v24, (a1), v0.t +; CHECK-NEXT: vmseq.vi v8, v16, 0 ; CHECK-NEXT: addi a0, a1, 128 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: addi a1, sp, 16 @@ -616,8 +616,8 @@ define void @masked_store_v128i16(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmseq.vi v8, v16, 0 ; CHECK-NEXT: vse16.v v24, (a1), v0.t +; CHECK-NEXT: vmseq.vi v8, v16, 0 ; CHECK-NEXT: addi a0, a1, 128 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: addi a1, sp, 16 @@ 
-664,8 +664,8 @@ define void @masked_store_v256i8(ptr %val_ptr, ptr %a, ptr %m_ptr) nounwind { ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmseq.vi v8, v16, 0 ; CHECK-NEXT: vse8.v v24, (a1), v0.t +; CHECK-NEXT: vmseq.vi v8, v16, 0 ; CHECK-NEXT: addi a0, a1, 128 ; CHECK-NEXT: vmv1r.v v0, v8 ; CHECK-NEXT: addi a1, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index e9e147861df56..9b77f801545da 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -1937,8 +1937,8 @@ define float @vreduce_fminimum_v64f32(ptr %x) { ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 @@ -2034,8 +2034,8 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 @@ -2052,8 +2052,8 @@ define float @vreduce_fminimum_v128f32(ptr %x) { ; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 @@ -2274,8 +2274,8 @@ define double @vreduce_fminimum_v32f64(ptr %x) { ; CHECK-NEXT: vle64.v v16, (a0) ; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 @@ -2369,8 +2369,8 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 @@ -2387,8 +2387,8 @@ define double @vreduce_fminimum_v64f64(ptr %x) { ; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 @@ -2693,8 +2693,8 @@ define float @vreduce_fmaximum_v64f32(ptr %x) { ; CHECK-NEXT: vle32.v v16, (a0) ; CHECK-NEXT: vle32.v v24, (a1) ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 @@ -2790,8 +2790,8 @@ define float 
@vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 @@ -2808,8 +2808,8 @@ define float @vreduce_fmaximum_v128f32(ptr %x) { ; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 @@ -3030,8 +3030,8 @@ define double @vreduce_fmaximum_v32f64(ptr %x) { ; CHECK-NEXT: vle64.v v16, (a0) ; CHECK-NEXT: vle64.v v24, (a1) ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmv1r.v v0, v7 @@ -3125,8 +3125,8 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmerge.vvm v16, v8, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 @@ -3143,8 +3143,8 @@ define double @vreduce_fmaximum_v64f64(ptr %x) { ; CHECK-NEXT: vmfeq.vv v0, v24, v24 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll index 016f95bfef7e7..5e68d8cbb0755 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll @@ -1379,9 +1379,9 @@ define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl) ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV32-NEXT: vmv.s.x v9, a1 +; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vmsne.vi v9, v9, 0 ; RV32-NEXT: vmand.mm v0, v9, v0 ; RV32-NEXT: vmv.v.i v9, 1 @@ -1400,9 +1400,9 @@ define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl) ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma ; RV64-NEXT: vmv.s.x v9, a1 +; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vmsne.vi v9, v9, 0 ; RV64-NEXT: vmand.mm v0, v9, v0 ; RV64-NEXT: vmv.v.i v9, 1 @@ -1427,10 +1427,10 @@ define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV32-NEXT: vid.v v9 ; RV32-NEXT: vmsltu.vx v9, v9, a1 +; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vmand.mm v0, v9, 
v0 ; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; RV32-NEXT: vmv.v.i v9, 1 @@ -1452,10 +1452,10 @@ define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; RV64-NEXT: vid.v v9 ; RV64-NEXT: vmsltu.vx v9, v9, a1 +; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vmand.mm v0, v9, v0 ; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma ; RV64-NEXT: vmv.v.i v9, 1 @@ -1483,10 +1483,10 @@ define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV32-NEXT: vid.v v9 ; RV32-NEXT: vmsltu.vx v9, v9, a1 +; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vmand.mm v0, v9, v0 ; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV32-NEXT: vmv.v.i v9, 1 @@ -1510,10 +1510,10 @@ define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vid.v v9 ; RV64-NEXT: vmsltu.vx v9, v9, a1 +; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vmand.mm v0, v9, v0 ; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma ; RV64-NEXT: vmv.v.i v9, 1 @@ -1543,10 +1543,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vid.v v10 ; RV32-NEXT: vmsltu.vx v9, v10, a1 +; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vmand.mm v0, v9, v0 ; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV32-NEXT: vmv.v.i v9, 1 @@ -1572,10 +1572,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-NEXT: vid.v v10 ; RV64-NEXT: vmsltu.vx v9, v10, a1 +; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vmand.mm v0, v9, v0 ; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.i v9, 1 @@ -1607,10 +1607,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m, ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vid.v v12 ; RV32-NEXT: vmsltu.vx v9, v12, a1 +; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vmand.mm v0, v9, v0 ; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV32-NEXT: vmv.v.i v9, 1 @@ -1638,10 +1638,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m, ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vid.v v12 ; RV64-NEXT: vmsltu.vx v9, v12, a1 +; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vmand.mm v0, v9, v0 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64-NEXT: vmv.v.i v9, 1 @@ -1754,10 +1754,10 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x 
i1> %m, ; RV32-NEXT: addi a2, a2, %lo(.LCPI72_0) ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vle8.v v12, (a2) -; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vid.v v16 ; RV32-NEXT: vmsltu.vx v14, v16, a1 ; RV32-NEXT: vsext.vf4 v16, v12 +; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vmsltu.vx v12, v16, a1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vslideup.vi v14, v12, 4 @@ -1798,10 +1798,10 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, ; RV64-NEXT: addi a2, a2, %lo(.LCPI72_0) ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle8.v v12, (a2) -; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vid.v v16 ; RV64-NEXT: vmsltu.vx v14, v16, a1 ; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vmsltu.vx v12, v16, a1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vi v14, v12, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll index 05896d8ef6ffd..bf86eb6585503 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-sdnode.ll @@ -23,9 +23,8 @@ define @vfmax_nxv1f16_vv( %a, @vfmax_nxv1f16_vv( %a, @vfmax_nxv2f16_vv( %a, @vfmax_nxv2f16_vv( %a, @vfmax_nxv4f16_vv( %a, @vfmax_nxv4f16_vv( %a, @vfmax_nxv8f16_vv( %a, @vfmax_nxv8f16_vv( %a, @vfmax_nxv16f16_vv( %a, @vfmax_nxv16f16_vv( %a, @vfmax_nxv32f16_vv( %a, @vfmax_nxv32f16_vv( %a, @vfmax_nxv32f16_vv( %a, @vfmax_nxv1f32_vv( %a, @vfmax_nxv2f32_vv( %a, @vfmax_nxv4f32_vv( %a, @vfmax_nxv8f32_vv( %a, @vfmax_nxv16f32_vv( %a, @vfmax_nxv1f64_vv( %a, @vfmax_nxv2f64_vv( %a, @vfmax_nxv4f64_vv( %a, @vfmax_nxv8f64_vv( %a, @vfmax_nxv1f16_vv_nnana( %a, @vfmax_nxv1f16_vv_nnanb( %a, @vfmax_vv_nxv1f16_unmasked( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -68,9 +67,8 @@ define @vfmax_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma @@ -124,9 +122,8 @@ define @vfmax_vv_nxv2f16_unmasked( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -140,9 +137,8 @@ define @vfmax_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 -; ZVFHMIN-NEXT: vmv.v.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 ; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma @@ -198,9 +194,8 @@ define @vfmax_vv_nxv4f16_unmasked( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, 
m1, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv.v.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -214,8 +209,8 @@ define @vfmax_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 ; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14 @@ -274,8 +269,8 @@ define @vfmax_vv_nxv8f16_unmasked( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmv1r.v v0, v12 ; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v14 @@ -290,8 +285,8 @@ define @vfmax_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 ; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20 @@ -363,8 +358,8 @@ define @vfmax_vv_nxv16f16_unmasked( %va ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v16, v12, v12 ; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0 +; ZVFH-NEXT: vmfeq.vv v16, v12, v12 ; ZVFH-NEXT: vmv1r.v v0, v16 ; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v20 @@ -385,8 +380,8 @@ define @vfmax_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 ; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 +; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv1r.v v0, v7 @@ -586,8 +581,8 @@ define @vfmax_vv_nxv32f16_unmasked( %va ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v7, v16, v16 ; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFH-NEXT: vmfeq.vv v7, v16, v16 ; ZVFH-NEXT: vmv1r.v v0, v7 ; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0 ; ZVFH-NEXT: vfmax.vv v8, v8, v24 @@ -677,8 +672,8 @@ define @vfmax_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 ; ZVFHMIN-NEXT: vmv1r.v v0, v3 ; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0 ; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24 @@ -718,9 +713,8 @@ define @vfmax_vv_nxv1f32_unmasked( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -752,9 
+746,8 @@ define @vfmax_vv_nxv2f32_unmasked( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -788,8 +781,8 @@ define @vfmax_vv_nxv4f32_unmasked( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v14 @@ -824,8 +817,8 @@ define @vfmax_vv_nxv8f32_unmasked( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v20 @@ -858,9 +851,8 @@ define @vfmax_vv_nxv1f64_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -894,8 +886,8 @@ define @vfmax_vv_nxv2f64_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v14 @@ -930,8 +922,8 @@ define @vfmax_vv_nxv4f64_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v20 @@ -979,8 +971,8 @@ define @vfmax_vv_nxv8f64_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v24 @@ -1189,7 +1181,6 @@ define @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -1197,6 +1188,7 @@ define @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 ; CHECK-NEXT: vfmax.vv v8, v16, v8 @@ -1218,8 +1210,8 @@ define @vfmax_vv_nxv16f64_unmasked( ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: 
vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: vfmax.vv v8, v8, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll index e942593924987..9caad8b1f69ea 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-sdnode.ll @@ -23,9 +23,8 @@ define @vfmin_nxv1f16_vv( %a, @vfmin_nxv1f16_vv( %a, @vfmin_nxv2f16_vv( %a, @vfmin_nxv2f16_vv( %a, @vfmin_nxv4f16_vv( %a, @vfmin_nxv4f16_vv( %a, @vfmin_nxv8f16_vv( %a, @vfmin_nxv8f16_vv( %a, @vfmin_nxv16f16_vv( %a, @vfmin_nxv16f16_vv( %a, @vfmin_nxv32f16_vv( %a, @vfmin_nxv32f16_vv( %a, @vfmin_nxv32f16_vv( %a, @vfmin_nxv1f32_vv( %a, @vfmin_nxv2f32_vv( %a, @vfmin_nxv4f32_vv( %a, @vfmin_nxv8f32_vv( %a, @vfmin_nxv16f32_vv( %a, @vfmin_nxv1f64_vv( %a, @vfmin_nxv2f64_vv( %a, @vfmin_nxv4f64_vv( %a, @vfmin_nxv8f64_vv( %a, @vfmin_nxv1f16_vv_nnana( %a, @vfmin_nxv1f16_vv_nnanb( %a, @vfmin_vv_nxv1f16_unmasked( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -68,9 +67,8 @@ define @vfmin_vv_nxv1f16_unmasked( %va, < ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 -; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma @@ -124,9 +122,8 @@ define @vfmin_vv_nxv2f16_unmasked( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -140,9 +137,8 @@ define @vfmin_vv_nxv2f16_unmasked( %va, < ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 -; ZVFHMIN-NEXT: vmv.v.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11 ; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 ; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma @@ -198,9 +194,8 @@ define @vfmin_vv_nxv4f16_unmasked( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v10, v9, v9 ; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 -; ZVFH-NEXT: vmv.v.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9 ; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v11 ; ZVFH-NEXT: ret @@ -214,8 +209,8 @@ define @vfmin_vv_nxv4f16_unmasked( %va, < ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 ; ZVFHMIN-NEXT: vfmin.vv v10, 
v8, v14 @@ -274,8 +269,8 @@ define @vfmin_vv_nxv8f16_unmasked( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmfeq.vv v12, v10, v10 ; ZVFH-NEXT: vmv1r.v v0, v12 ; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v14 @@ -290,8 +285,8 @@ define @vfmin_vv_nxv8f16_unmasked( %va, < ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 ; ZVFHMIN-NEXT: vmv1r.v v0, v8 ; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 ; ZVFHMIN-NEXT: vfmin.vv v12, v8, v20 @@ -363,8 +358,8 @@ define @vfmin_vv_nxv16f16_unmasked( %va ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v16, v12, v12 ; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0 +; ZVFH-NEXT: vmfeq.vv v16, v12, v12 ; ZVFH-NEXT: vmv1r.v v0, v16 ; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v20 @@ -385,8 +380,8 @@ define @vfmin_vv_nxv16f16_unmasked( %va ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 ; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 +; ZVFHMIN-NEXT: vmfeq.vv v7, v24, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; ZVFHMIN-NEXT: vmv1r.v v0, v7 @@ -586,8 +581,8 @@ define @vfmin_vv_nxv32f16_unmasked( %va ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vmfeq.vv v0, v8, v8 -; ZVFH-NEXT: vmfeq.vv v7, v16, v16 ; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFH-NEXT: vmfeq.vv v7, v16, v16 ; ZVFH-NEXT: vmv1r.v v0, v7 ; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0 ; ZVFH-NEXT: vfmin.vv v8, v8, v24 @@ -677,8 +672,8 @@ define @vfmin_vv_nxv32f16_unmasked( %va ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 ; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFHMIN-NEXT: vmfeq.vv v3, v16, v16 ; ZVFHMIN-NEXT: vmv1r.v v0, v3 ; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0 ; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24 @@ -718,9 +713,8 @@ define @vfmin_vv_nxv1f32_unmasked( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -752,9 +746,8 @@ define @vfmin_vv_nxv2f32_unmasked( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -788,8 +781,8 @@ define @vfmin_vv_nxv4f32_unmasked( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; 
CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v14 @@ -824,8 +817,8 @@ define @vfmin_vv_nxv8f32_unmasked( %va, ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v20 @@ -858,9 +851,8 @@ define @vfmin_vv_nxv1f64_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v10, v9, v9 ; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 -; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9 ; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v11 ; CHECK-NEXT: ret @@ -894,8 +886,8 @@ define @vfmin_vv_nxv2f64_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 ; CHECK-NEXT: vmv1r.v v0, v12 ; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v14 @@ -930,8 +922,8 @@ define @vfmin_vv_nxv4f64_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v20 @@ -979,8 +971,8 @@ define @vfmin_vv_nxv8f64_unmasked( %v ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v8, v8 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmfeq.vv v7, v16, v16 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v24 @@ -1189,7 +1181,6 @@ define @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vl8re64.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -1197,6 +1188,7 @@ define @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill ; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmfeq.vv v7, v24, v24 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 ; CHECK-NEXT: vfmin.vv v8, v16, v8 @@ -1218,8 +1210,8 @@ define @vfmin_vv_nxv16f64_unmasked( ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmfeq.vv v7, v8, v8 ; CHECK-NEXT: vmv1r.v v0, v7 ; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: vfmin.vv v8, v8, v24 diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll index 1395dc914bb40..6cfc98645f170 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll @@ -2361,10 +2361,9 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) { ; CHECK-NOV-NEXT: snez a1, a1 ; CHECK-NOV-NEXT: snez a2, s1 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: and a2, a2, s0 ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: and a1, a1, a0 -; 
CHECK-NOV-NEXT: mv a0, a2 +; CHECK-NOV-NEXT: and a0, a2, s0 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -2768,10 +2767,9 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) { ; CHECK-NOV-NEXT: snez a1, a1 ; CHECK-NOV-NEXT: snez a2, s1 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: and a2, a2, s0 ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: and a1, a1, a0 -; CHECK-NOV-NEXT: mv a0, a2 +; CHECK-NOV-NEXT: and a0, a2, s0 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload @@ -3173,10 +3171,9 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) { ; CHECK-NOV-NEXT: snez a1, a1 ; CHECK-NOV-NEXT: snez a2, s2 ; CHECK-NOV-NEXT: addi a2, a2, -1 -; CHECK-NOV-NEXT: and a2, a2, s1 ; CHECK-NOV-NEXT: addi a1, a1, -1 ; CHECK-NOV-NEXT: and a1, a1, a0 -; CHECK-NOV-NEXT: mv a0, a2 +; CHECK-NOV-NEXT: and a0, a2, s1 ; CHECK-NOV-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; CHECK-NOV-NEXT: ld s1, 8(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir index b891207341b33..54bdf984e82d9 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir +++ b/llvm/test/CodeGen/RISCV/rvv/mask-reg-alloc.mir @@ -21,7 +21,7 @@ body: | ; CHECK-NEXT: renamable $v0 = COPY killed renamable $v1 ; CHECK-NEXT: renamable $v9 = PseudoVMERGE_VIM_M1 undef renamable $v9, killed renamable $v3, 1, killed renamable $v0, 1, 3 /* e8 */, implicit $vl, implicit $vtype ; CHECK-NEXT: renamable $v0 = PseudoVADD_VV_M1 undef renamable $v0, killed renamable $v8, killed renamable $v9, 1, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype - ; CHECK-NEXT: PseudoRET implicit $v0 + ; CHECK-NEXT: PseudoRET implicit killed $v0 %0:vr = COPY $v0 %1:vr = COPY $v1 %2:vr = COPY $v2 diff --git a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll index 47b88ba71d556..4c5fd3a10a4d5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll +++ b/llvm/test/CodeGen/RISCV/rvv/no-reserved-frame.ll @@ -20,7 +20,9 @@ define signext i32 @foo(i32 signext %aa) #0 { ; CHECK-NEXT: sub sp, sp, a1 ; CHECK-NEXT: andi sp, sp, -16 ; CHECK-NEXT: mv s1, sp -; CHECK-NEXT: lw t0, 44(s1) +; CHECK-NEXT: sw a0, 52(s1) +; CHECK-NEXT: sw a0, 48(s1) +; CHECK-NEXT: lw a0, 44(s1) ; CHECK-NEXT: lw a2, 40(s1) ; CHECK-NEXT: lw a3, 36(s1) ; CHECK-NEXT: lw a4, 32(s1) @@ -30,14 +32,11 @@ define signext i32 @foo(i32 signext %aa) #0 { ; CHECK-NEXT: lw t1, 16(s1) ; CHECK-NEXT: lw a1, 12(s1) ; CHECK-NEXT: lw t2, 8(s1) -; CHECK-NEXT: sw a0, 52(s1) -; CHECK-NEXT: sw a0, 48(s1) ; CHECK-NEXT: addi sp, sp, -32 ; CHECK-NEXT: sd t2, 16(sp) ; CHECK-NEXT: sd a1, 8(sp) ; CHECK-NEXT: addi a1, s1, 48 ; CHECK-NEXT: sd t1, 0(sp) -; CHECK-NEXT: mv a0, t0 ; CHECK-NEXT: call gfunc ; CHECK-NEXT: addi sp, sp, 32 ; CHECK-NEXT: li a0, 0 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index eb02fd895f18d..6300571686013 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -23,10 +23,10 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu ; CHECK-NEXT: 
vadd.vi v12, v11, -16 ; CHECK-NEXT: vrgather.vv v9, v8, v12, v0.t -; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vrgather.vv v13, v10, v12 ; CHECK-NEXT: vadd.vi v10, v11, -15 +; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vrgather.vv v13, v8, v10, v0.t ; CHECK-NEXT: vmsne.vi v8, v13, 0 ; CHECK-NEXT: vmv.v.v v0, v9 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index bcb008857ad32..6a4ebb6b30af2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -97,10 +97,10 @@ define {, } @vector_deinterleave_nxv64i1_nxv ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; CHECK-NEXT: vmv1r.v v12, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; CHECK-NEXT: vmv.v.i v24, 0 ; CHECK-NEXT: vmerge.vim v16, v24, 1, v0 +; CHECK-NEXT: vmv1r.v v12, v8 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma ; CHECK-NEXT: vnsrl.wi v8, v16, 0 ; CHECK-NEXT: vmv1r.v v0, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll index 2e5b67c93fce1..aa0a2e4a52f31 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmfeq.ll @@ -35,9 +35,8 @@ define @intrinsic_vmfeq_mask_vv_nxv1f16_nxv1f16( @intrinsic_vmfeq_mask_vv_nxv2f16_nxv2f16( @intrinsic_vmfeq_mask_vv_nxv4f16_nxv4f16( @intrinsic_vmfeq_mask_vv_nxv1f32_nxv1f32( @intrinsic_vmfeq_mask_vv_nxv2f32_nxv2f32( @intrinsic_vmfeq_mask_vv_nxv1f64_nxv1f64( @intrinsic_vmfge_mask_vv_nxv1f16_nxv1f16( @intrinsic_vmfge_mask_vv_nxv2f16_nxv2f16( @intrinsic_vmfge_mask_vv_nxv4f16_nxv4f16( @intrinsic_vmfge_mask_vv_nxv1f32_nxv1f32( @intrinsic_vmfge_mask_vv_nxv2f32_nxv2f32( @intrinsic_vmfge_mask_vv_nxv1f64_nxv1f64( @intrinsic_vmfgt_mask_vv_nxv1f16_nxv1f16( @intrinsic_vmfgt_mask_vv_nxv2f16_nxv2f16( @intrinsic_vmfgt_mask_vv_nxv4f16_nxv4f16( @intrinsic_vmfgt_mask_vv_nxv1f32_nxv1f32( @intrinsic_vmfgt_mask_vv_nxv2f32_nxv2f32( @intrinsic_vmfgt_mask_vv_nxv1f64_nxv1f64( @intrinsic_vmfle_mask_vv_nxv1f16_nxv1f16( @intrinsic_vmfle_mask_vv_nxv2f16_nxv2f16( @intrinsic_vmfle_mask_vv_nxv4f16_nxv4f16( @intrinsic_vmfle_mask_vv_nxv1f32_nxv1f32( @intrinsic_vmfle_mask_vv_nxv2f32_nxv2f32( @intrinsic_vmfle_mask_vv_nxv1f64_nxv1f64( @intrinsic_vmflt_mask_vv_nxv1f16_nxv1f16( @intrinsic_vmflt_mask_vv_nxv2f16_nxv2f16( @intrinsic_vmflt_mask_vv_nxv4f16_nxv4f16( @intrinsic_vmflt_mask_vv_nxv1f32_nxv1f32( @intrinsic_vmflt_mask_vv_nxv2f32_nxv2f32( @intrinsic_vmflt_mask_vv_nxv1f64_nxv1f64( @intrinsic_vmfne_mask_vv_nxv1f16_nxv1f16( @intrinsic_vmfne_mask_vv_nxv2f16_nxv2f16( @intrinsic_vmfne_mask_vv_nxv4f16_nxv4f16( @intrinsic_vmfne_mask_vv_nxv1f32_nxv1f32( @intrinsic_vmfne_mask_vv_nxv2f32_nxv2f32( @intrinsic_vmfne_mask_vv_nxv1f64_nxv1f64( @intrinsic_vmseq_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmseq_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmseq_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmseq_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmseq_mask_vv_nxv1i16_nxv1i16( @intrinsic_vmseq_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmseq_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmseq_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmseq_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmseq_mask_vv_nxv1i64_nxv1i64( @intrinsic_vmsge_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmsge_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmsge_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmsge_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmsge_mask_vv_nxv1i16_nxv1i16( 
@intrinsic_vmsge_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmsge_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmsge_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmsge_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmsge_mask_vv_nxv1i64_nxv1i64( @intrinsic_vmsgeu_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmsgeu_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmsgeu_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmsgeu_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmsgeu_mask_vv_nxv1i16_nxv1i16( @intrinsic_vmsgeu_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmsgeu_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmsgeu_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmsgeu_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmsgeu_mask_vv_nxv1i64_nxv1i64( @intrinsic_vmsgt_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmsgt_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmsgt_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmsgt_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmsgt_mask_vv_nxv1i16_nxv1i16( @intrinsic_vmsgt_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmsgt_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmsgt_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmsgt_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmsgt_mask_vv_nxv1i64_nxv1i64( @intrinsic_vmsgtu_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmsgtu_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmsgtu_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmsgtu_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmsgtu_mask_vv_nxv1i16_nxv1i16( @intrinsic_vmsgtu_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmsgtu_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmsgtu_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmsgtu_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmsgtu_mask_vv_nxv1i64_nxv1i64( @intrinsic_vmsle_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmsle_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmsle_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmsle_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmsle_mask_vv_nxv1i16_nxv1i16( @intrinsic_vmsle_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmsle_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmsle_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmsle_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmsle_mask_vv_nxv1i64_nxv1i64( @intrinsic_vmsleu_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmsleu_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmsleu_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmsleu_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmsleu_mask_vv_nxv1i16_nxv1i16( @intrinsic_vmsleu_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmsleu_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmsleu_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmsleu_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmsleu_mask_vv_nxv1i64_nxv1i64( @intrinsic_vmslt_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmslt_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmslt_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmslt_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmslt_mask_vv_nxv1i16_nxv1i16( @intrinsic_vmslt_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmslt_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmslt_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmslt_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmslt_mask_vv_nxv1i64_nxv1i64( @intrinsic_vmsltu_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmsltu_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmsltu_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmsltu_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmsltu_mask_vv_nxv1i16_nxv1i16( @intrinsic_vmsltu_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmsltu_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmsltu_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmsltu_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmsltu_mask_vv_nxv1i64_nxv1i64( @intrinsic_vmsne_mask_vv_nxv1i8_nxv1i8( @intrinsic_vmsne_mask_vv_nxv2i8_nxv2i8( @intrinsic_vmsne_mask_vv_nxv4i8_nxv4i8( @intrinsic_vmsne_mask_vv_nxv8i8_nxv8i8( @intrinsic_vmsne_mask_vv_nxv1i16_nxv1i16( @intrinsic_vmsne_mask_vv_nxv2i16_nxv2i16( @intrinsic_vmsne_mask_vv_nxv4i16_nxv4i16( @intrinsic_vmsne_mask_vv_nxv1i32_nxv1i32( @intrinsic_vmsne_mask_vv_nxv2i32_nxv2i32( @intrinsic_vmsne_mask_vv_nxv1i64_nxv1i64( 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll
@vselect_combine_regression( %v
; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu
; CHECK-NEXT: vmseq.vi v0, v8, 0
; CHECK-NEXT: vmv.v.i v16, 0
-; CHECK-NEXT: vmseq.vi v7, v24, 0
; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vle64.v v8, (a0), v0.t
+; CHECK-NEXT: vmseq.vi v7, v24, 0
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: vle64.v v16, (a1), v0.t
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll
index c3b19b59ec3d6..40c3401363dda 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-regression.ll
@@ -12,9 +12,8 @@ define i32 @illegal_preserve_vl( %a, %x, pt
; CHECK-NEXT: vsetvli a1, zero, e64, m4, ta, ma
; CHECK-NEXT: vadd.vv v12, v12, v12
; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
-; CHECK-NEXT: vmv.x.s a1, v8
; CHECK-NEXT: vs4r.v v12, (a0)
-; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: vmv.x.s a0, v8
; CHECK-NEXT: ret
%index = add %x, %x
store %index, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
index 2bac1eeb90609..56c80f33694f5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir
@@ -14,7 +14,7 @@ body: |
; MIR-NEXT: WriteVXRMImm 0, implicit-def $vxrm
; MIR-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype
; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef $v8, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vxrm, implicit $vl, implicit $vtype
- ; MIR-NEXT: PseudoRET implicit $v8
+ ; MIR-NEXT: PseudoRET implicit killed $v8
; ASM-LABEL: verify_vxrm:
; ASM: # %bb.0:
; ASM-NEXT: csrwi vxrm, 0
diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll
index f61cbfd3ed725..6910f71e7271a 100644
--- a/llvm/test/CodeGen/RISCV/shifts.ll
+++ b/llvm/test/CodeGen/RISCV/shifts.ll
@@ -64,9 +64,8 @@ define i64 @ashr64(i64 %a, i64 %b) nounwind {
; RV32I-NEXT: sra a1, a1, a2
; RV32I-NEXT: bltz a4, .LBB2_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: srai a3, a3, 31
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: mv a1, a3
+; RV32I-NEXT: srai a1, a3, 31
; RV32I-NEXT: ret
; RV32I-NEXT: .LBB2_2:
; RV32I-NEXT: srl a0, a0, a2
@@ -421,9 +420,8 @@ define i128 @ashr128(i128 %a, i128 %b) nounwind {
; RV64I-NEXT: sra a1, a1, a2
; RV64I-NEXT: bltz a4, .LBB7_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: srai a3, a3, 63
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: mv a1, a3
+; RV64I-NEXT: srai a1, a3, 63
; RV64I-NEXT: ret
; RV64I-NEXT: .LBB7_2:
; RV64I-NEXT: srl a0, a0, a2
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index 7fc4713ac2d6e..55ccb4d130c86 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -21,10 +21,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
; RV32I-NEXT: lh s0, 12(a1)
; RV32I-NEXT: lh s1, 8(a1)
; RV32I-NEXT: lh s2, 4(a1)
-; RV32I-NEXT: lh a2, 0(a1)
; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: lh a0, 0(a1)
; RV32I-NEXT: li a1, 95
-; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: call __modsi3
; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: li a1, -124
@@ -113,10 +112,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
; RV64I-NEXT: lh s0, 24(a1)
; RV64I-NEXT: lh s1, 16(a1)
; RV64I-NEXT: lh s2, 8(a1)
-; RV64I-NEXT: lh a2, 0(a1)
; RV64I-NEXT: mv s3, a0
+; RV64I-NEXT: lh a0, 0(a1)
; RV64I-NEXT: li a1, 95
-; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: call __moddi3
; RV64I-NEXT: mv s4, a0
; RV64I-NEXT: li a1, -124
@@ -209,10 +207,9 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
; RV32I-NEXT: lh s0, 12(a1)
; RV32I-NEXT: lh s1, 8(a1)
; RV32I-NEXT: lh s2, 4(a1)
-; RV32I-NEXT: lh a2, 0(a1)
; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: lh a0, 0(a1)
; RV32I-NEXT: li a1, 95
-; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: call __modsi3
; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: li a1, 95
@@ -294,10 +291,9 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) nounwind {
; RV64I-NEXT: lh s0, 24(a1)
; RV64I-NEXT: lh s1, 16(a1)
; RV64I-NEXT: lh s2, 8(a1)
-; RV64I-NEXT: lh a2, 0(a1)
; RV64I-NEXT: mv s3, a0
+; RV64I-NEXT: lh a0, 0(a1)
; RV64I-NEXT: li a1, 95
-; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: call __moddi3
; RV64I-NEXT: mv s4, a0
; RV64I-NEXT: li a1, 95
@@ -775,10 +771,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: lh s0, 12(a1)
; RV32I-NEXT: lh s1, 8(a1)
-; RV32I-NEXT: lh a2, 4(a1)
; RV32I-NEXT: mv s2, a0
+; RV32I-NEXT: lh a0, 4(a1)
; RV32I-NEXT: li a1, 654
-; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: call __modsi3
; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: li a1, 23
@@ -852,10 +847,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) nounwind {
; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: lh s0, 24(a1)
; RV64I-NEXT: lh s1, 16(a1)
-; RV64I-NEXT: lh a2, 8(a1)
; RV64I-NEXT: mv s2, a0
+; RV64I-NEXT: lh a0, 8(a1)
; RV64I-NEXT: li a1, 654
-; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: call __moddi3
; RV64I-NEXT: mv s3, a0
; RV64I-NEXT: li a1, 23
@@ -1091,11 +1085,10 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
; RV32I-NEXT: lw s3, 20(a1)
; RV32I-NEXT: lw s4, 8(a1)
; RV32I-NEXT: lw s5, 12(a1)
-; RV32I-NEXT: lw a3, 0(a1)
-; RV32I-NEXT: lw a1, 4(a1)
; RV32I-NEXT: mv s6, a0
+; RV32I-NEXT: lw a0, 0(a1)
+; RV32I-NEXT: lw a1, 4(a1)
; RV32I-NEXT: li a2, 1
-; RV32I-NEXT: mv a0, a3
; RV32I-NEXT: li a3, 0
; RV32I-NEXT: call __moddi3
; RV32I-NEXT: mv s7, a0
@@ -1160,11 +1153,10 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: lw s3, 20(a1)
; RV32IM-NEXT: lw s4, 8(a1)
; RV32IM-NEXT: lw s5, 12(a1)
-; RV32IM-NEXT: lw a3, 0(a1)
-; RV32IM-NEXT: lw a1, 4(a1)
; RV32IM-NEXT: mv s6, a0
+; RV32IM-NEXT: lw a0, 0(a1)
+; RV32IM-NEXT: lw a1, 4(a1)
; RV32IM-NEXT: li a2, 1
-; RV32IM-NEXT: mv a0, a3
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __moddi3
; RV32IM-NEXT: mv s7, a0
@@ -1220,10 +1212,9 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) nounwind {
; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: ld s0, 24(a1)
; RV64I-NEXT: ld s1, 16(a1)
-; RV64I-NEXT: ld a2, 8(a1)
; RV64I-NEXT: mv s2, a0
+; RV64I-NEXT: ld a0, 8(a1)
; RV64I-NEXT: li a1, 654
-; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: call __moddi3
; RV64I-NEXT: mv s3, a0
; RV64I-NEXT: li a1, 23
diff --git a/llvm/test/CodeGen/RISCV/tail-calls.ll b/llvm/test/CodeGen/RISCV/tail-calls.ll
index d3e495bb723ad..818f803e2da18 100644
--- a/llvm/test/CodeGen/RISCV/tail-calls.ll
+++ b/llvm/test/CodeGen/RISCV/tail-calls.ll
@@ -19,11 +19,10 @@ declare void @llvm.memcpy.p0.p0.i32(ptr, ptr, i32, i1)
define void @caller_extern(ptr %src) optsize {
; CHECK-LABEL: caller_extern:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a1, %hi(dest)
-; CHECK-NEXT: addi a1, a1, %lo(dest)
-; CHECK-NEXT: li a2, 7
; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: lui a0, %hi(dest)
+; CHECK-NEXT: addi a0, a0, %lo(dest)
+; CHECK-NEXT: li a2, 7
; CHECK-NEXT: mv a1, a3
; CHECK-NEXT: tail memcpy
entry:
@@ -36,11 +35,10 @@ entry:
define void @caller_extern_pgso(ptr %src) !prof !14 {
; CHECK-LABEL: caller_extern_pgso:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lui a1, %hi(dest_pgso)
-; CHECK-NEXT: addi a1, a1, %lo(dest_pgso)
-; CHECK-NEXT: li a2, 7
; CHECK-NEXT: mv a3, a0
+; CHECK-NEXT: lui a0, %hi(dest_pgso)
+; CHECK-NEXT: addi a0, a0, %lo(dest_pgso)
+; CHECK-NEXT: li a2, 7
; CHECK-NEXT: mv a1, a3
; CHECK-NEXT: tail memcpy
entry:
diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
index 10497db6edc49..9033bdc966f75 100644
--- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll
@@ -136,9 +136,8 @@ define i64 @load_i64(ptr %p) {
;
; RV32I-FAST-LABEL: load_i64:
; RV32I-FAST: # %bb.0:
-; RV32I-FAST-NEXT: lw a2, 0(a0)
; RV32I-FAST-NEXT: lw a1, 4(a0)
-; RV32I-FAST-NEXT: mv a0, a2
+; RV32I-FAST-NEXT: lw a0, 0(a0)
; RV32I-FAST-NEXT: ret
;
; RV64I-FAST-LABEL: load_i64:
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index c057c656e0fb7..be0ca4609fd19 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -22,10 +22,9 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
; RV32I-NEXT: lhu s0, 12(a1)
; RV32I-NEXT: lhu s1, 8(a1)
; RV32I-NEXT: lhu s2, 4(a1)
-; RV32I-NEXT: lhu a2, 0(a1)
; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: lhu a0, 0(a1)
; RV32I-NEXT: li a1, 95
-; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: call __umodsi3
; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: li a1, 124
@@ -101,10 +100,9 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
; RV64I-NEXT: lhu s0, 24(a1)
; RV64I-NEXT: lhu s1, 16(a1)
; RV64I-NEXT: lhu s2, 8(a1)
-; RV64I-NEXT: lhu a2, 0(a1)
; RV64I-NEXT: mv s3, a0
+; RV64I-NEXT: lhu a0, 0(a1)
; RV64I-NEXT: li a1, 95
-; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: call __umoddi3
; RV64I-NEXT: mv s4, a0
; RV64I-NEXT: li a1, 124
@@ -184,10 +182,9 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
; RV32I-NEXT: lhu s0, 12(a1)
; RV32I-NEXT: lhu s1, 8(a1)
; RV32I-NEXT: lhu s2, 4(a1)
-; RV32I-NEXT: lhu a2, 0(a1)
; RV32I-NEXT: mv s3, a0
+; RV32I-NEXT: lhu a0, 0(a1)
; RV32I-NEXT: li a1, 95
-; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: call __umodsi3
; RV32I-NEXT: mv s4, a0
; RV32I-NEXT: li a1, 95
@@ -253,10 +250,9 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
; RV64I-NEXT: lhu s0, 24(a1)
; RV64I-NEXT: lhu s1, 16(a1)
; RV64I-NEXT: lhu s2, 8(a1)
-; RV64I-NEXT: lhu a2, 0(a1)
; RV64I-NEXT: mv s3, a0
+; RV64I-NEXT: lhu a0, 0(a1)
; RV64I-NEXT: li a1, 95
-; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: call __umoddi3
; RV64I-NEXT: mv s4, a0
; RV64I-NEXT: li a1, 95
@@ -536,10 +532,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
; RV32I-NEXT: lhu s1, 8(a1)
; RV32I-NEXT: lhu s2, 4(a1)
; RV32I-NEXT: lhu s3, 0(a1)
-; RV32I-NEXT: lhu a2, 12(a1)
; RV32I-NEXT: mv s0, a0
+; RV32I-NEXT: lhu a0, 12(a1)
; RV32I-NEXT: li a1, 95
-; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: call __umodsi3
; RV32I-NEXT: andi a1, s3, 63
; RV32I-NEXT: andi a2, s2, 31
@@ -588,10 +583,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
; RV64I-NEXT: lhu s1, 16(a1)
; RV64I-NEXT: lhu s2, 8(a1)
; RV64I-NEXT: lhu s3, 0(a1)
-; RV64I-NEXT: lhu a2, 24(a1)
; RV64I-NEXT: mv s0, a0
+; RV64I-NEXT: lhu a0, 24(a1)
; RV64I-NEXT: li a1, 95
-; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: call __umoddi3
; RV64I-NEXT: andi a1, s3, 63
; RV64I-NEXT: andi a2, s2, 31
@@ -644,10 +638,9 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
; RV32I-NEXT: lhu s0, 12(a1)
; RV32I-NEXT: lhu s1, 8(a1)
-; RV32I-NEXT: lhu a2, 4(a1)
; RV32I-NEXT: mv s2, a0
+; RV32I-NEXT: lhu a0, 4(a1)
; RV32I-NEXT: li a1, 654
-; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: call __umodsi3
; RV32I-NEXT: mv s3, a0
; RV32I-NEXT: li a1, 23
@@ -710,10 +703,9 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: lhu s0, 24(a1)
; RV64I-NEXT: lhu s1, 16(a1)
-; RV64I-NEXT: lhu a2, 8(a1)
; RV64I-NEXT: mv s2, a0
+; RV64I-NEXT: lhu a0, 8(a1)
; RV64I-NEXT: li a1, 654
-; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: call __umoddi3
; RV64I-NEXT: mv s3, a0
; RV64I-NEXT: li a1, 23
@@ -799,11 +791,10 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32I-NEXT: lw s3, 20(a1)
; RV32I-NEXT: lw s4, 8(a1)
; RV32I-NEXT: lw s5, 12(a1)
-; RV32I-NEXT: lw a3, 0(a1)
-; RV32I-NEXT: lw a1, 4(a1)
; RV32I-NEXT: mv s6, a0
+; RV32I-NEXT: lw a0, 0(a1)
+; RV32I-NEXT: lw a1, 4(a1)
; RV32I-NEXT: li a2, 1
-; RV32I-NEXT: mv a0, a3
; RV32I-NEXT: li a3, 0
; RV32I-NEXT: call __umoddi3
; RV32I-NEXT: mv s7, a0
@@ -868,11 +859,10 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: lw s3, 20(a1)
; RV32IM-NEXT: lw s4, 8(a1)
; RV32IM-NEXT: lw s5, 12(a1)
-; RV32IM-NEXT: lw a3, 0(a1)
-; RV32IM-NEXT: lw a1, 4(a1)
; RV32IM-NEXT: mv s6, a0
+; RV32IM-NEXT: lw a0, 0(a1)
+; RV32IM-NEXT: lw a1, 4(a1)
; RV32IM-NEXT: li a2, 1
-; RV32IM-NEXT: mv a0, a3
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
; RV32IM-NEXT: mv s7, a0
@@ -928,10 +918,9 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
; RV64I-NEXT: ld s0, 24(a1)
; RV64I-NEXT: ld s1, 16(a1)
-; RV64I-NEXT: ld a2, 8(a1)
; RV64I-NEXT: mv s2, a0
+; RV64I-NEXT: ld a0, 8(a1)
; RV64I-NEXT: li a1, 654
-; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: call __umoddi3
; RV64I-NEXT: mv s3, a0
; RV64I-NEXT: li a1, 23
diff --git a/llvm/test/CodeGen/RISCV/wide-mem.ll b/llvm/test/CodeGen/RISCV/wide-mem.ll
index f98680a9f2dae..05270af824551 100644
--- a/llvm/test/CodeGen/RISCV/wide-mem.ll
+++ b/llvm/test/CodeGen/RISCV/wide-mem.ll
@@ -7,9 +7,8 @@ define i64 @load_i64(ptr %a) nounwind {
; RV32I-LABEL: load_i64:
; RV32I: # %bb.0:
-; RV32I-NEXT: lw a2, 0(a0)
; RV32I-NEXT: lw a1, 4(a0)
-; RV32I-NEXT: mv a0, a2
+; RV32I-NEXT: lw a0, 0(a0)
; RV32I-NEXT: ret
%1 = load i64, ptr %a
ret i64 %1
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
index b0d435368e92b..8d6a5cd1c38b4 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -560,9 +560,8 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sra a1, a3, a5
; RV32I-NEXT: bltz a6, .LBB5_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: srai a4, a4, 31
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: mv a1, a4
+; RV32I-NEXT: srai a1, a4, 31
; RV32I-NEXT: j .LBB5_3
; RV32I-NEXT: .LBB5_2:
; RV32I-NEXT: lbu a4, 1(a0)
@@ -1094,9 +1093,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sra a1, a3, a5
; RV64I-NEXT: bltz a6, .LBB8_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sraiw a3, a4, 31
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: mv a1, a3
+; RV64I-NEXT: sraiw a1, a4, 31
; RV64I-NEXT: j .LBB8_3
; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: lbu a4, 1(a0)
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index a601256bc2afa..4c2ab1230ee6e 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -543,9 +543,8 @@ define void @ashr_8bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV32I-NEXT: sra a1, a3, a5
; RV32I-NEXT: bltz a6, .LBB5_2
; RV32I-NEXT: # %bb.1:
-; RV32I-NEXT: srai a4, a4, 31
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: mv a1, a4
+; RV32I-NEXT: srai a1, a4, 31
; RV32I-NEXT: j .LBB5_3
; RV32I-NEXT: .LBB5_2:
; RV32I-NEXT: lbu a4, 1(a0)
@@ -1203,9 +1202,8 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; RV64I-NEXT: sra a1, a3, a5
; RV64I-NEXT: bltz a6, .LBB8_2
; RV64I-NEXT: # %bb.1:
-; RV64I-NEXT: sraiw a3, a4, 31
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: mv a1, a3
+; RV64I-NEXT: sraiw a1, a4, 31
; RV64I-NEXT: j .LBB8_3
; RV64I-NEXT: .LBB8_2:
; RV64I-NEXT: lbu a4, 1(a0)
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index b1efe53290e8e..c99b7e9a759fc 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -866,11 +866,10 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32-NEXT: xor a6, a1, a5
; RV32-NEXT: xor a1, a1, a3
; RV32-NEXT: and a1, a1, a6
-; RV32-NEXT: slti a1, a1, 0
; RV32-NEXT: sub a0, a0, a2
; RV32-NEXT: sw a0, 0(a4)
+; RV32-NEXT: slti a0, a1, 0
; RV32-NEXT: sw a5, 4(a4)
-; RV32-NEXT: mv a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: ssubo.i64:
@@ -890,11 +889,10 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32ZBA-NEXT: xor a6, a1, a5
; RV32ZBA-NEXT: xor a1, a1, a3
; RV32ZBA-NEXT: and a1, a1, a6
-; RV32ZBA-NEXT: slti a1, a1, 0
; RV32ZBA-NEXT: sub a0, a0, a2
; RV32ZBA-NEXT: sw a0, 0(a4)
+; RV32ZBA-NEXT: slti a0, a1, 0
; RV32ZBA-NEXT: sw a5, 4(a4)
-; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: ssubo.i64:
@@ -914,11 +912,10 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32ZICOND-NEXT: xor a6, a1, a5
; RV32ZICOND-NEXT: xor a1, a1, a3
; RV32ZICOND-NEXT: and a1, a1, a6
-; RV32ZICOND-NEXT: slti a1, a1, 0
; RV32ZICOND-NEXT: sub a0, a0, a2
; RV32ZICOND-NEXT: sw a0, 0(a4)
+; RV32ZICOND-NEXT: slti a0, a1, 0
; RV32ZICOND-NEXT: sw a5, 4(a4)
-; RV32ZICOND-NEXT: mv a0, a1
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: ssubo.i64:
@@ -1367,11 +1364,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32-NEXT: xor a1, a1, a3
; RV32-NEXT: xor a3, s0, a3
; RV32-NEXT: or a1, a3, a1
-; RV32-NEXT: snez a1, a1
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: sw a0, 0(a4)
+; RV32-NEXT: snez a0, a1
; RV32-NEXT: sw a5, 4(a4)
-; RV32-NEXT: mv a0, a1
; RV32-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
@@ -1438,11 +1434,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32ZBA-NEXT: xor a1, a1, a3
; RV32ZBA-NEXT: xor a3, s0, a3
; RV32ZBA-NEXT: or a1, a3, a1
-; RV32ZBA-NEXT: snez a1, a1
; RV32ZBA-NEXT: mul a0, a0, a2
; RV32ZBA-NEXT: sw a0, 0(a4)
+; RV32ZBA-NEXT: snez a0, a1
; RV32ZBA-NEXT: sw a5, 4(a4)
-; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32ZBA-NEXT: addi sp, sp, 16
@@ -1509,11 +1504,10 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32ZICOND-NEXT: xor a1, a1, a3
; RV32ZICOND-NEXT: xor a3, s0, a3
; RV32ZICOND-NEXT: or a1, a3, a1
-; RV32ZICOND-NEXT: snez a1, a1
; RV32ZICOND-NEXT: mul a0, a0, a2
; RV32ZICOND-NEXT: sw a0, 0(a4)
+; RV32ZICOND-NEXT: snez a0, a1
; RV32ZICOND-NEXT: sw a5, 4(a4)
-; RV32ZICOND-NEXT: mv a0, a1
; RV32ZICOND-NEXT: lw s0, 12(sp) # 4-byte Folded Reload
; RV32ZICOND-NEXT: lw s1, 8(sp) # 4-byte Folded Reload
; RV32ZICOND-NEXT: addi sp, sp, 16
@@ -1556,11 +1550,10 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
; RV32-NEXT: add a1, a1, a5
; RV32-NEXT: xor a1, a1, a7
; RV32-NEXT: or a1, t0, a1
-; RV32-NEXT: snez a1, a1
; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: sw a0, 0(a2)
+; RV32-NEXT: snez a0, a1
; RV32-NEXT: sw a4, 4(a2)
-; RV32-NEXT: mv a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: smulo2.i64:
@@ -1595,12 +1588,11 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
; RV32ZBA-NEXT: add a1, a1, a5
; RV32ZBA-NEXT: xor a1, a1, a7
; RV32ZBA-NEXT: or a1, t0, a1
-; RV32ZBA-NEXT: snez a1, a1
; RV32ZBA-NEXT: sh1add a3, a0, a0
; RV32ZBA-NEXT: sh2add a0, a3, a0
; RV32ZBA-NEXT: sw a0, 0(a2)
+; RV32ZBA-NEXT: snez a0, a1
; RV32ZBA-NEXT: sw a4, 4(a2)
-; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: smulo2.i64:
@@ -1634,11 +1626,10 @@ define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) {
; RV32ZICOND-NEXT: add a1, a1, a5
; RV32ZICOND-NEXT: xor a1, a1, a7
; RV32ZICOND-NEXT: or a1, t0, a1
-; RV32ZICOND-NEXT: snez a1, a1
; RV32ZICOND-NEXT: mul a0, a0, a3
; RV32ZICOND-NEXT: sw a0, 0(a2)
+; RV32ZICOND-NEXT: snez a0, a1
; RV32ZICOND-NEXT: sw a4, 4(a2)
-; RV32ZICOND-NEXT: mv a0, a1
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: smulo2.i64:
@@ -1663,10 +1654,9 @@ define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
; RV32-LABEL: umulo.i32:
; RV32: # %bb.0: # %entry
; RV32-NEXT: mulhu a3, a0, a1
-; RV32-NEXT: snez a3, a3
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: sw a0, 0(a2)
-; RV32-NEXT: mv a0, a3
+; RV32-NEXT: snez a0, a3
; RV32-NEXT: ret
;
; RV64-LABEL: umulo.i32:
@@ -1682,10 +1672,9 @@ define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
; RV32ZBA-LABEL: umulo.i32:
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: mulhu a3, a0, a1
-; RV32ZBA-NEXT: snez a3, a3
; RV32ZBA-NEXT: mul a0, a0, a1
; RV32ZBA-NEXT: sw a0, 0(a2)
-; RV32ZBA-NEXT: mv a0, a3
+; RV32ZBA-NEXT: snez a0, a3
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: umulo.i32:
@@ -1701,10 +1690,9 @@ define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
; RV32ZICOND-LABEL: umulo.i32:
; RV32ZICOND: # %bb.0: # %entry
; RV32ZICOND-NEXT: mulhu a3, a0, a1
-; RV32ZICOND-NEXT: snez a3, a3
; RV32ZICOND-NEXT: mul a0, a0, a1
; RV32ZICOND-NEXT: sw a0, 0(a2)
-; RV32ZICOND-NEXT: mv a0, a3
+; RV32ZICOND-NEXT: snez a0, a3
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: umulo.i32:
@@ -1729,10 +1717,9 @@ define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
; RV32: # %bb.0: # %entry
; RV32-NEXT: li a3, 13
; RV32-NEXT: mulhu a2, a0, a3
-; RV32-NEXT: snez a2, a2
; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: sw a0, 0(a1)
-; RV32-NEXT: mv a0, a2
+; RV32-NEXT: snez a0, a2
; RV32-NEXT: ret
;
; RV64-LABEL: umulo2.i32:
@@ -1750,11 +1737,10 @@ define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
; RV32ZBA: # %bb.0: # %entry
; RV32ZBA-NEXT: li a2, 13
; RV32ZBA-NEXT: mulhu a2, a0, a2
-; RV32ZBA-NEXT: snez a2, a2
; RV32ZBA-NEXT: sh1add a3, a0, a0
; RV32ZBA-NEXT: sh2add a0, a3, a0
; RV32ZBA-NEXT: sw a0, 0(a1)
-; RV32ZBA-NEXT: mv a0, a2
+; RV32ZBA-NEXT: snez a0, a2
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: umulo2.i32:
@@ -1771,10 +1757,9 @@ define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
; RV32ZICOND: # %bb.0: # %entry
; RV32ZICOND-NEXT: li a3, 13
; RV32ZICOND-NEXT: mulhu a2, a0, a3
-; RV32ZICOND-NEXT: snez a2, a2
; RV32ZICOND-NEXT: mul a0, a0, a3
; RV32ZICOND-NEXT: sw a0, 0(a1)
-; RV32ZICOND-NEXT: mv a0, a2
+; RV32ZICOND-NEXT: snez a0, a2
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: umulo2.i32:
@@ -1882,20 +1867,18 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32-NEXT: mulhu a3, a3, a0
; RV32-NEXT: snez a3, a3
; RV32-NEXT: or a1, a1, a3
-; RV32-NEXT: or a1, a1, a6
; RV32-NEXT: mul a0, a0, a2
; RV32-NEXT: sw a0, 0(a4)
+; RV32-NEXT: or a0, a1, a6
; RV32-NEXT: sw a5, 4(a4)
-; RV32-NEXT: mv a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: umulo.i64:
; RV64: # %bb.0: # %entry
; RV64-NEXT: mulhu a3, a0, a1
-; RV64-NEXT: snez a3, a3
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: sd a0, 0(a2)
-; RV64-NEXT: mv a0, a3
+; RV64-NEXT: snez a0, a3
; RV64-NEXT: ret
;
; RV32ZBA-LABEL: umulo.i64:
@@ -1915,20 +1898,18 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32ZBA-NEXT: mulhu a3, a3, a0
; RV32ZBA-NEXT: snez a3, a3
; RV32ZBA-NEXT: or a1, a1, a3
-; RV32ZBA-NEXT: or a1, a1, a6
; RV32ZBA-NEXT: mul a0, a0, a2
; RV32ZBA-NEXT: sw a0, 0(a4)
+; RV32ZBA-NEXT: or a0, a1, a6
; RV32ZBA-NEXT: sw a5, 4(a4)
-; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: umulo.i64:
; RV64ZBA: # %bb.0: # %entry
; RV64ZBA-NEXT: mulhu a3, a0, a1
-; RV64ZBA-NEXT: snez a3, a3
; RV64ZBA-NEXT: mul a0, a0, a1
; RV64ZBA-NEXT: sd a0, 0(a2)
-; RV64ZBA-NEXT: mv a0, a3
+; RV64ZBA-NEXT: snez a0, a3
; RV64ZBA-NEXT: ret
;
; RV32ZICOND-LABEL: umulo.i64:
@@ -1948,20 +1929,18 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
; RV32ZICOND-NEXT: mulhu a3, a3, a0
; RV32ZICOND-NEXT: snez a3, a3
; RV32ZICOND-NEXT: or a1, a1, a3
-; RV32ZICOND-NEXT: or a1, a1, a6
; RV32ZICOND-NEXT: mul a0, a0, a2
; RV32ZICOND-NEXT: sw a0, 0(a4)
+; RV32ZICOND-NEXT: or a0, a1, a6
; RV32ZICOND-NEXT: sw a5, 4(a4)
-; RV32ZICOND-NEXT: mv a0, a1
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: umulo.i64:
; RV64ZICOND: # %bb.0: # %entry
; RV64ZICOND-NEXT: mulhu a3, a0, a1
-; RV64ZICOND-NEXT: snez a3, a3
; RV64ZICOND-NEXT: mul a0, a0, a1
; RV64ZICOND-NEXT: sd a0, 0(a2)
-; RV64ZICOND-NEXT: mv a0, a3
+; RV64ZICOND-NEXT: snez a0, a3
; RV64ZICOND-NEXT: ret
entry:
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
@@ -1981,21 +1960,19 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
; RV32-NEXT: sltu a5, a4, a5
; RV32-NEXT: mulhu a1, a1, a3
; RV32-NEXT: snez a1, a1
-; RV32-NEXT: or a1, a1, a5
; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: sw a0, 0(a2)
+; RV32-NEXT: or a0, a1, a5
; RV32-NEXT: sw a4, 4(a2)
-; RV32-NEXT: mv a0, a1
; RV32-NEXT: ret
;
; RV64-LABEL: umulo2.i64:
; RV64: # %bb.0: # %entry
; RV64-NEXT: li a3, 13
; RV64-NEXT: mulhu a2, a0, a3
-; RV64-NEXT: snez a2, a2
; RV64-NEXT: mul a0, a0, a3
; RV64-NEXT: sd a0, 0(a1)
-; RV64-NEXT: mv a0, a2
+; RV64-NEXT: snez a0, a2
; RV64-NEXT: ret
;
; RV32ZBA-LABEL: umulo2.i64:
@@ -2008,23 +1985,21 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
; RV32ZBA-NEXT: sltu a4, a5, a4
; RV32ZBA-NEXT: mulhu a1, a1, a3
; RV32ZBA-NEXT: snez a1, a1
-; RV32ZBA-NEXT: or a1, a1, a4
; RV32ZBA-NEXT: sh1add a3, a0, a0
; RV32ZBA-NEXT: sh2add a0, a3, a0
; RV32ZBA-NEXT: sw a0, 0(a2)
+; RV32ZBA-NEXT: or a0, a1, a4
; RV32ZBA-NEXT: sw a5, 4(a2)
-; RV32ZBA-NEXT: mv a0, a1
; RV32ZBA-NEXT: ret
;
; RV64ZBA-LABEL: umulo2.i64:
; RV64ZBA: # %bb.0: # %entry
; RV64ZBA-NEXT: li a2, 13
; RV64ZBA-NEXT: mulhu a2, a0, a2
-; RV64ZBA-NEXT: snez a2, a2
; RV64ZBA-NEXT: sh1add a3, a0, a0
; RV64ZBA-NEXT: sh2add a0, a3, a0
; RV64ZBA-NEXT: sd a0, 0(a1)
-; RV64ZBA-NEXT: mv a0, a2
+; RV64ZBA-NEXT: snez a0, a2
; RV64ZBA-NEXT: ret
;
; RV32ZICOND-LABEL: umulo2.i64:
@@ -2036,21 +2011,19 @@ define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
; RV32ZICOND-NEXT: sltu a5, a4, a5
; RV32ZICOND-NEXT: mulhu a1, a1, a3
; RV32ZICOND-NEXT: snez a1, a1
-; RV32ZICOND-NEXT: or a1, a1, a5
; RV32ZICOND-NEXT: mul a0, a0, a3
; RV32ZICOND-NEXT: sw a0, 0(a2)
+; RV32ZICOND-NEXT: or a0, a1, a5
; RV32ZICOND-NEXT: sw a4, 4(a2)
-; RV32ZICOND-NEXT: mv a0, a1
; RV32ZICOND-NEXT: ret
;
; RV64ZICOND-LABEL: umulo2.i64:
; RV64ZICOND: # %bb.0: # %entry
; RV64ZICOND-NEXT: li a3, 13
; RV64ZICOND-NEXT: mulhu a2, a0, a3
-; RV64ZICOND-NEXT: snez a2, a2
; RV64ZICOND-NEXT: mul a0, a0, a3
; RV64ZICOND-NEXT: sd a0, 0(a1)
-; RV64ZICOND-NEXT: mv a0, a2
+; RV64ZICOND-NEXT: snez a0, a2
; RV64ZICOND-NEXT: ret
entry:
%t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 13)
diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
index 46aa383866e93..bbf1df20aeda5 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
@@ -455,12 +455,12 @@ define ptr @swib(ptr %base, i32 %a, i32 %b) {
define ptr @sdia(ptr %base, i64 %a, i64 %b) {
; RV32XTHEADMEMIDX-LABEL: sdia:
; RV32XTHEADMEMIDX: # %bb.0:
-; RV32XTHEADMEMIDX-NEXT: addi a5, a0, 64
; RV32XTHEADMEMIDX-NEXT: add a2, a2, a4
; RV32XTHEADMEMIDX-NEXT: add a3, a1, a3
; RV32XTHEADMEMIDX-NEXT: sltu a1, a3, a1
; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
; RV32XTHEADMEMIDX-NEXT: sw a3, 0(a0)
+; RV32XTHEADMEMIDX-NEXT: addi a5, a0, 64
; RV32XTHEADMEMIDX-NEXT: sw a1, 4(a0)
; RV32XTHEADMEMIDX-NEXT: mv a0, a5
; RV32XTHEADMEMIDX-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir b/llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir
index 93931ff950a8c..81ca7ca9bb8bd 100644
--- a/llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir
+++ b/llvm/test/CodeGen/RISCV/zcmp-cm-popretz.mir
@@ -19,7 +19,7 @@ body: |
; CHECK-ZCMP32-LABEL: name: popret_rvlist5
; CHECK-ZCMP32: liveins: $x1, $x8
; CHECK-ZCMP32-NEXT: {{ $}}
- ; CHECK-ZCMP32-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit $x1, implicit $x8
+ ; CHECK-ZCMP32-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit killed $x1, implicit killed $x8
; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -8
; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -4
@@ -41,7 +41,7 @@ body: |
; CHECK-ZCMP64-LABEL: name: popret_rvlist5
; CHECK-ZCMP64: liveins: $x1, $x8
; CHECK-ZCMP64-NEXT: {{ $}}
- ; CHECK-ZCMP64-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit $x1, implicit $x8
+ ; CHECK-ZCMP64-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit killed $x1, implicit killed $x8
; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -16
; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -8
@@ -103,7 +103,7 @@ body: |
; CHECK-ZCMP32-LABEL: name: popretz_rvlist5
; CHECK-ZCMP32: liveins: $x1, $x8
; CHECK-ZCMP32-NEXT: {{ $}}
- ; CHECK-ZCMP32-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit $x1, implicit $x8
+ ; CHECK-ZCMP32-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit killed $x1, implicit killed $x8
; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -8
; CHECK-ZCMP32-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -4
@@ -121,12 +121,12 @@ body: |
; CHECK-LIBCALL32-NEXT: $x1 = IMPLICIT_DEF
; CHECK-LIBCALL32-NEXT: $x8 = IMPLICIT_DEF
; CHECK-LIBCALL32-NEXT: $x10 = ADDI $x0, 0
- ; CHECK-LIBCALL32-NEXT: frame-destroy PseudoTAIL target-flags(riscv-call) &__riscv_restore_1, implicit $x2, implicit $x10
+ ; CHECK-LIBCALL32-NEXT: frame-destroy PseudoTAIL target-flags(riscv-call) &__riscv_restore_1, implicit $x2, implicit killed $x10
;
; CHECK-ZCMP64-LABEL: name: popretz_rvlist5
; CHECK-ZCMP64: liveins: $x1, $x8
; CHECK-ZCMP64-NEXT: {{ $}}
- ; CHECK-ZCMP64-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit $x1, implicit $x8
+ ; CHECK-ZCMP64-NEXT: frame-setup CM_PUSH 5, 0, implicit-def $x2, implicit $x2, implicit killed $x1, implicit killed $x8
; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION offset $x1, -16
; CHECK-ZCMP64-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -8
@@ -144,7 +144,7 @@ body: |
; CHECK-LIBCALL64-NEXT: $x1 = IMPLICIT_DEF
; CHECK-LIBCALL64-NEXT: $x8 = IMPLICIT_DEF
; CHECK-LIBCALL64-NEXT: $x10 = ADDI $x0, 0
- ; CHECK-LIBCALL64-NEXT: frame-destroy PseudoTAIL target-flags(riscv-call) &__riscv_restore_1, implicit $x2, implicit $x10
+ ; CHECK-LIBCALL64-NEXT: frame-destroy PseudoTAIL target-flags(riscv-call) &__riscv_restore_1, implicit $x2, implicit killed $x10
;
; CHECK-NO-ZCMP32-LABEL: name: popretz_rvlist5
; CHECK-NO-ZCMP32: liveins: $x1, $x8
@@ -161,7 +161,7 @@ body: |
; CHECK-NO-ZCMP32-NEXT: $x1 = LW $x2, 12 :: (load (s32) from %stack.0)
; CHECK-NO-ZCMP32-NEXT: $x8 = LW $x2, 8 :: (load (s32) from %stack.1)
; CHECK-NO-ZCMP32-NEXT: $x2 = frame-destroy ADDI $x2, 16
- ; CHECK-NO-ZCMP32-NEXT: PseudoRET implicit $x10
+ ; CHECK-NO-ZCMP32-NEXT: PseudoRET implicit killed $x10
;
; CHECK-NO-ZCMP64-LABEL: name: popretz_rvlist5
; CHECK-NO-ZCMP64: liveins: $x1, $x8
@@ -178,7 +178,7 @@ body: |
; CHECK-NO-ZCMP64-NEXT: $x1 = LD $x2, 8 :: (load (s64) from %stack.0)
; CHECK-NO-ZCMP64-NEXT: $x8 = LD $x2, 0 :: (load (s64) from %stack.1)
; CHECK-NO-ZCMP64-NEXT: $x2 = frame-destroy ADDI $x2, 16
- ; CHECK-NO-ZCMP64-NEXT: PseudoRET implicit $x10
+ ; CHECK-NO-ZCMP64-NEXT: PseudoRET implicit killed $x10
$x1 = IMPLICIT_DEF
$x8 = IMPLICIT_DEF
$x10 = COPY $x0
diff --git a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
index f8557419c4199..abb7fff831afe 100644
--- a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
@@ -207,11 +207,10 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
; ARM-NEXT: .save {r4, lr}
; ARM-NEXT: push {r4, lr}
; ARM-NEXT: lsls r0, r0, #28
-; ARM-NEXT: asrs r4, r0, #31
; ARM-NEXT: lsls r1, r1, #28
; ARM-NEXT: asrs r2, r1, #28
; ARM-NEXT: asrs r3, r1, #31
-; ARM-NEXT: mov r1, r4
+; ARM-NEXT: asrs r1, r0, #31
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: cmp r1, #1
; ARM-NEXT: bgt .LBB2_2
@@ -387,11 +386,10 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
; ARM-NEXT: .save {r4, lr}
; ARM-NEXT: push {r4, lr}
; ARM-NEXT: lsls r0, r0, #28
-; ARM-NEXT: asrs r4, r0, #31
; ARM-NEXT: lsls r1, r1, #28
; ARM-NEXT: asrs r2, r1, #28
; ARM-NEXT: asrs r3, r1, #31
-; ARM-NEXT: mov r1, r4
+; ARM-NEXT: asrs r1, r0, #31
; ARM-NEXT: bl __aeabi_lmul
; ARM-NEXT: cmp r1, #0
; ARM-NEXT: bmi .LBB5_2
diff --git a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
index 9b5fa1c2bc811..b0cc1c6886298 100644
--- a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
@@ -9,10 +9,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; THUMBV6-NEXT: .pad #60
; THUMBV6-NEXT: sub sp, #60
; THUMBV6-NEXT: mov r6, r3
-; THUMBV6-NEXT: mov r1, r2
-; THUMBV6-NEXT: str r2, [sp, #52] @ 4-byte Spill
; THUMBV6-NEXT: mov r4, r0
; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill
+; THUMBV6-NEXT: mov r1, r2
+; THUMBV6-NEXT: str r2, [sp, #52] @ 4-byte Spill
; THUMBV6-NEXT: ldr r2, [sp, #88]
; THUMBV6-NEXT: str r2, [sp, #48] @ 4-byte Spill
; THUMBV6-NEXT: movs r5, #0
diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
index 939ab71a8061c..8b8ed524240c2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
@@ -750,11 +750,10 @@ define arm_aapcs_vfpcc <4 x float> @frem_f32(<4 x float> %in1, <4 x float> %in2)
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: mov r1, r5
; CHECK-NEXT: bl fmodf
-; CHECK-NEXT: vmov r4, r2, d10
-; CHECK-NEXT: vmov r5, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d10
+; CHECK-NEXT: vmov r5, r1, d8
; CHECK-NEXT: vmov s18, r6
-; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -885,11 +884,9 @@ define arm_aapcs_vfpcc <2 x double> @fdiv_f64(<2 x double> %in1, <2 x double> %i
; CHECK-NEXT: vmov r0, r1, d11
; CHECK-NEXT: vmov r2, r3, d9
; CHECK-NEXT: bl __aeabi_ddiv
-; CHECK-NEXT: vmov lr, r12, d10
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, lr
-; CHECK-NEXT: mov r1, r12
+; CHECK-NEXT: vmov r0, r1, d10
+; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: bl __aeabi_ddiv
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -912,11 +909,9 @@ define arm_aapcs_vfpcc <2 x double> @frem_f64(<2 x double> %in1, <2 x double> %i
; CHECK-NEXT: vmov r0, r1, d11
; CHECK-NEXT: vmov r2, r3, d9
; CHECK-NEXT: bl fmod
-; CHECK-NEXT: vmov lr, r12, d10
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, lr
-; CHECK-NEXT: mov r1, r12
+; CHECK-NEXT: vmov r0, r1, d10
+; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: bl fmod
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
index d747da76a45fa..da3c0d66f2209 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll
@@ -52,10 +52,8 @@ define arm_aapcs_vfpcc <2 x double> @sqrt_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl sqrt
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl sqrt
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -79,10 +77,9 @@ define arm_aapcs_vfpcc <4 x float> @cos_float32_t(<4 x float> %src) {
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl cosf
-; CHECK-NEXT: vmov r4, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d8
; CHECK-NEXT: vmov s18, r5
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bl cosf
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -109,8 +106,7 @@ define arm_aapcs_vfpcc <8 x half> @cos_float16_t(<8 x half> %src) {
; CHECK-NEXT: bl cosf
; CHECK-NEXT: vcvtt.f32.f16 s0, s16
; CHECK-NEXT: vmov s16, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: bl cosf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s20, s16
@@ -163,10 +159,8 @@ define arm_aapcs_vfpcc <2 x double> @cos_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl cos
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl cos
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -190,10 +184,9 @@ define arm_aapcs_vfpcc <4 x float> @sin_float32_t(<4 x float> %src) {
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl sinf
-; CHECK-NEXT: vmov r4, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d8
; CHECK-NEXT: vmov s18, r5
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bl sinf
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -220,8 +213,7 @@ define arm_aapcs_vfpcc <8 x half> @sin_float16_t(<8 x half> %src) {
; CHECK-NEXT: bl sinf
; CHECK-NEXT: vcvtt.f32.f16 s0, s16
; CHECK-NEXT: vmov s16, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: bl sinf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s20, s16
@@ -274,10 +266,8 @@ define arm_aapcs_vfpcc <2 x double> @sin_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl sin
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl sin
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -301,10 +291,9 @@ define arm_aapcs_vfpcc <4 x float> @tan_float32_t(<4 x float> %src) {
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl tanf
-; CHECK-NEXT: vmov r4, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d8
; CHECK-NEXT: vmov s18, r5
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bl tanf
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -331,8 +320,7 @@ define arm_aapcs_vfpcc <8 x half> @tan_float16_t(<8 x half> %src) {
; CHECK-NEXT: bl tanf
; CHECK-NEXT: vcvtt.f32.f16 s0, s16
; CHECK-NEXT: vmov s16, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: bl tanf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s20, s16
@@ -385,10 +373,8 @@ define arm_aapcs_vfpcc <2 x double> @tan_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl tan
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl tan
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -412,10 +398,9 @@ define arm_aapcs_vfpcc <4 x float> @exp_float32_t(<4 x float> %src) {
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl expf
-; CHECK-NEXT: vmov r4, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d8
; CHECK-NEXT: vmov s18, r5
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bl expf
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -442,8 +427,7 @@ define arm_aapcs_vfpcc <8 x half> @exp_float16_t(<8 x half> %src) {
; CHECK-NEXT: bl expf
; CHECK-NEXT: vcvtt.f32.f16 s0, s16
; CHECK-NEXT: vmov s16, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: bl expf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s20, s16
@@ -496,10 +480,8 @@ define arm_aapcs_vfpcc <2 x double> @exp_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl exp
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl exp
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -523,10 +505,9 @@ define arm_aapcs_vfpcc <4 x float> @exp2_float32_t(<4 x float> %src) {
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl exp2f
-; CHECK-NEXT: vmov r4, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d8
; CHECK-NEXT: vmov s18, r5
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bl exp2f
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -553,8 +534,7 @@ define arm_aapcs_vfpcc <8 x half> @exp2_float16_t(<8 x half> %src) {
; CHECK-NEXT: bl exp2f
; CHECK-NEXT: vcvtt.f32.f16 s0, s16
; CHECK-NEXT: vmov s16, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: bl exp2f
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s20, s16
@@ -607,10 +587,8 @@ define arm_aapcs_vfpcc <2 x double> @exp2_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl exp2
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl exp2
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -634,10 +612,9 @@ define arm_aapcs_vfpcc <4 x float> @log_float32_t(<4 x float> %src) {
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl logf
-; CHECK-NEXT: vmov r4, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d8
; CHECK-NEXT: vmov s18, r5
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bl logf
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -664,8 +641,7 @@ define arm_aapcs_vfpcc <8 x half> @log_float16_t(<8 x half> %src) {
; CHECK-NEXT: bl logf
; CHECK-NEXT: vcvtt.f32.f16 s0, s16
; CHECK-NEXT: vmov s16, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: bl logf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s20, s16
@@ -718,10 +694,8 @@ define arm_aapcs_vfpcc <2 x double> @log_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl log
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl log
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -745,10 +719,9 @@ define arm_aapcs_vfpcc <4 x float> @log2_float32_t(<4 x float> %src) {
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl log2f
-; CHECK-NEXT: vmov r4, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d8
; CHECK-NEXT: vmov s18, r5
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bl log2f
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -775,8 +748,7 @@ define arm_aapcs_vfpcc <8 x half> @log2_float16_t(<8 x half> %src) {
; CHECK-NEXT: bl log2f
; CHECK-NEXT: vcvtt.f32.f16 s0, s16
; CHECK-NEXT: vmov s16, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: bl log2f
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s20, s16
@@ -829,10 +801,8 @@ define arm_aapcs_vfpcc <2 x double> @log2_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl log2
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl log2
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -856,10 +826,9 @@ define arm_aapcs_vfpcc <4 x float> @log10_float32_t(<4 x float> %src) {
; CHECK-NEXT: mov r5, r0
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: bl log10f
-; CHECK-NEXT: vmov r4, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d8
; CHECK-NEXT: vmov s18, r5
-; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bl log10f
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -886,8 +855,7 @@ define arm_aapcs_vfpcc <8 x half> @log10_float16_t(<8 x half> %src) {
; CHECK-NEXT: bl log10f
; CHECK-NEXT: vcvtt.f32.f16 s0, s16
; CHECK-NEXT: vmov s16, r0
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: bl log10f
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s20, s16
@@ -940,10 +908,8 @@ define arm_aapcs_vfpcc <2 x double> @log10_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl log10
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl log10
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -970,11 +936,10 @@ define arm_aapcs_vfpcc <4 x float> @pow_float32_t(<4 x float> %src1, <4 x float>
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: mov r1, r5
; CHECK-NEXT: bl powf
-; CHECK-NEXT: vmov r4, r2, d10
-; CHECK-NEXT: vmov r5, r1, d8
; CHECK-NEXT: vmov s19, r0
+; CHECK-NEXT: vmov r4, r0, d10
+; CHECK-NEXT: vmov r5, r1, d8
; CHECK-NEXT: vmov s18, r6
-; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: bl powf
; CHECK-NEXT: vmov s17, r0
; CHECK-NEXT: mov r0, r4
@@ -1075,11 +1040,9 @@ define arm_aapcs_vfpcc <2 x double> @pow_float64_t(<2 x double> %src1, <2 x doub
; CHECK-NEXT: vmov r0, r1, d11
; CHECK-NEXT: vmov r2, r3, d9
; CHECK-NEXT: bl pow
-; CHECK-NEXT: vmov lr, r12, d10
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, lr
-; CHECK-NEXT: mov r1, r12
+; CHECK-NEXT: vmov r0, r1, d10
+; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: bl pow
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
index f2ac526892180..de4b24da27a8d 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
@@ -622,23 +622,23 @@ define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16(<8 x half> %x) {
; CHECK-NEXT: vmovx.f16 s6, s0
; CHECK-NEXT: vcvt.s32.f16 s10, s0
; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vcvt.s32.f16 s5, s3
-; CHECK-NEXT: vcvt.s32.f16 s12, s0
+; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vmovx.f16 s0, s2
+; CHECK-NEXT: vcvt.s32.f16 s5, s3
; CHECK-NEXT: vcvt.s32.f16 s7, s2
-; CHECK-NEXT: vcvt.s32.f16 s14, s0
+; CHECK-NEXT: vcvt.s32.f16 s8, s1
+; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vcvt.s32.f16 s8, s1
+; CHECK-NEXT: vcvt.s32.f16 s14, s0
+; CHECK-NEXT: vcvt.s32.f16 s12, s0
; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vcvt.s32.f16 s4, s4
-; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
@@ -1704,23 +1704,23 @@ define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
; CHECK-NEXT: vmovx.f16 s6, s0
; CHECK-NEXT: vcvt.s32.f16 s10, s0
; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vcvt.s32.f16 s5, s3
-; CHECK-NEXT: vcvt.s32.f16 s12, s0
+; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vmovx.f16 s0, s2
+; CHECK-NEXT: vcvt.s32.f16 s5, s3
; CHECK-NEXT: vcvt.s32.f16 s7, s2
-; CHECK-NEXT: vcvt.s32.f16 s14, s0
+; CHECK-NEXT: vcvt.s32.f16 s8, s1
+; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vcvt.s32.f16 s8, s1
+; CHECK-NEXT: vcvt.s32.f16 s14, s0
+; CHECK-NEXT: vcvt.s32.f16 s12, s0
; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vcvt.s32.f16 s4, s4
-; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index 81b6a6940a7d6..aa8c618b41274 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -1379,12 +1379,12 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f16_v5i32(<5 x half> %f) {
; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vcvt.s32.f16 s8, s1
; CHECK-NEXT: vcvt.s32.f16 s0, s0
-; CHECK-NEXT: vcvt.s32.f16 s4, s4
-; CHECK-NEXT: vcvt.s32.f16 s6, s6
; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vcvt.s32.f16 s2, s2
+; CHECK-NEXT: vcvt.s32.f16 s6, s6
; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vcvt.s32.f16 s4, s4
; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: vcvt.s32.f16 s2, s2
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
@@ -1404,11 +1404,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
; CHECK-NEXT: vcvt.s32.f16 s10, s1
; CHECK-NEXT: vcvt.s32.f16 s0, s0
; CHECK-NEXT: vcvt.s32.f16 s4, s2
-; CHECK-NEXT: vmovx.f16 s2, s2
-; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vcvt.s32.f16 s2, s2
; CHECK-NEXT: vcvt.s32.f16 s8, s8
+; CHECK-NEXT: vcvt.s32.f16 s6, s6
; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: vcvt.s32.f16 s2, s2
+; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
; CHECK-NEXT: vmov r1, s6
@@ -1431,11 +1431,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_signed_v7f16_v7i32(<7 x half> %f) {
; CHECK-NEXT: vcvt.s32.f16 s12, s1
; CHECK-NEXT: vcvt.s32.f16 s0, s0
; CHECK-NEXT: vcvt.s32.f16 s4, s2
-; CHECK-NEXT: vmovx.f16 s2, s2
-; CHECK-NEXT: vcvt.s32.f16 s8, s8
+; CHECK-NEXT: vcvt.s32.f16 s2, s2
; CHECK-NEXT: vcvt.s32.f16 s10, s10
+; CHECK-NEXT: vcvt.s32.f16 s8, s8
; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvt.s32.f16 s2, s2
+; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vcvt.s32.f16 s6, s3
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
@@ -1465,9 +1465,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32(<8 x half> %f) {
; CHECK-NEXT: vcvt.s32.f16 s14, s2
; CHECK-NEXT: vcvt.s32.f16 s2, s1
; CHECK-NEXT: vcvt.s32.f16 s0, s0
-; CHECK-NEXT: vcvt.s32.f16 s4, s4
-; CHECK-NEXT: vcvt.s32.f16 s6, s6
; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vcvt.s32.f16 s4, s4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vcvt.s32.f16 s12, s3
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
@@ -4823,9 +4823,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32_duplicate(<8 x half> %
; CHECK-NEXT: vcvt.s32.f16 s14, s2
; CHECK-NEXT: vcvt.s32.f16 s2, s1
; CHECK-NEXT: vcvt.s32.f16 s0, s0
-; CHECK-NEXT: vcvt.s32.f16 s4, s4
-; CHECK-NEXT: vcvt.s32.f16 s6, s6
; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vcvt.s32.f16 s4, s4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vcvt.s32.f16 s12, s3
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
@@ -5233,14 +5233,13 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r6, #0
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: vcvtt.f32.f16 s19, s17
-; CHECK-NEXT: mov r7, r1
-; CHECK-NEXT: vmov r1, s19
; CHECK-NEXT: vcmp.f32 s24, s30
+; CHECK-NEXT: vcvtt.f32.f16 s19, s17
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s24, s28
+; CHECK-NEXT: mov r7, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
@@ -5249,7 +5248,7 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
; CHECK-NEXT: vmov q5[2], q5[0], r0, r6
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s19
; CHECK-NEXT: bl __aeabi_f2lz
; CHECK-NEXT: vcvtb.f32.f16 s17, s17
; CHECK-NEXT: mov r6, r0
@@ -5293,14 +5292,13 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
; CHECK-NEXT: movvs r7, #0
; CHECK-NEXT: vmov q5[3], q5[1], r7, r5
; CHECK-NEXT: bl __aeabi_f2lz
-; CHECK-NEXT: vcvtt.f32.f16 s16, s18
-; CHECK-NEXT: mov r7, r1
-; CHECK-NEXT: vmov r1, s16
; CHECK-NEXT: vcmp.f32 s17, s30
+; CHECK-NEXT: vcvtt.f32.f16 s16, s18
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s17, s28
+; CHECK-NEXT: mov r7, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
@@ -5309,7 +5307,7 @@ define arm_aapcs_vfpcc <8 x i64> @test_signed_v8f16_v8i64(<8 x half> %f) {
; CHECK-NEXT: it vs
; CHECK-NEXT: movvs r0, #0
; CHECK-NEXT: vmov q6[2], q6[0], r0, r6
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s16
; CHECK-NEXT: bl __aeabi_f2lz
; CHECK-NEXT: vcvtb.f32.f16 s18, s18
; CHECK-NEXT: mov r6, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 5ab184a066e49..1849341ce72b7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -1145,12 +1145,12 @@ define arm_aapcs_vfpcc <5 x i32> @test_unsigned_v5f16_v5i32(<5 x half> %f) {
; CHECK-NEXT: vmovx.f16 s4, s1
; CHECK-NEXT: vcvt.u32.f16 s8, s1
; CHECK-NEXT: vcvt.u32.f16 s0, s0
-; CHECK-NEXT: vcvt.u32.f16 s4, s4
-; CHECK-NEXT: vcvt.u32.f16 s6, s6
; CHECK-NEXT: vmov r1, s8
-; CHECK-NEXT: vcvt.u32.f16 s2, s2
+; CHECK-NEXT: vcvt.u32.f16 s6, s6
; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vcvt.u32.f16 s4, s4
; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT: vcvt.u32.f16 s2, s2
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
@@ -1170,11 +1170,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
; CHECK-NEXT: vcvt.u32.f16 s10, s1
; CHECK-NEXT: vcvt.u32.f16 s0, s0
; CHECK-NEXT: vcvt.u32.f16 s4, s2
-; CHECK-NEXT: vmovx.f16 s2, s2
-; CHECK-NEXT: vcvt.u32.f16 s6, s6
+; CHECK-NEXT: vcvt.u32.f16 s2, s2
; CHECK-NEXT: vcvt.u32.f16 s8, s8
+; CHECK-NEXT: vcvt.u32.f16 s6, s6
; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: vcvt.u32.f16 s2, s2
+; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
; CHECK-NEXT: vmov r1, s6
@@ -1197,11 +1197,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_unsigned_v7f16_v7i32(<7 x half> %f) {
; CHECK-NEXT: vcvt.u32.f16 s12, s1
; CHECK-NEXT: vcvt.u32.f16 s0, s0
; CHECK-NEXT: vcvt.u32.f16 s4, s2
-; CHECK-NEXT: vmovx.f16 s2, s2
-; CHECK-NEXT: vcvt.u32.f16 s8, s8
+; CHECK-NEXT: vcvt.u32.f16 s2, s2
; CHECK-NEXT: vcvt.u32.f16 s10, s10
+; CHECK-NEXT: vcvt.u32.f16 s8, s8
; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvt.u32.f16 s2, s2
+; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vcvt.u32.f16 s6, s3
; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
@@ -1231,9 +1231,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_unsigned_v8f16_v8i32(<8 x half> %f) {
; CHECK-NEXT: vcvt.u32.f16 s14, s2
; CHECK-NEXT: vcvt.u32.f16 s2, s1
; CHECK-NEXT: vcvt.u32.f16 s0, s0
-; CHECK-NEXT: vcvt.u32.f16 s4, s4
-; CHECK-NEXT: vcvt.u32.f16 s6, s6
; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vcvt.u32.f16 s6, s6
+; CHECK-NEXT: vcvt.u32.f16 s4, s4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vcvt.u32.f16 s12, s3
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
@@ -1769,8 +1769,8 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
; CHECK-NEXT: str.w r0, [r8, #25]
-; CHECK-NEXT: vmov r7, s17
; CHECK-NEXT: vmov r4, s19
+; CHECK-NEXT: vmov r7, s17
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: vcmp.f32 s16, #0
@@ -3735,9 +3735,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_unsigned_v8f16_v8i32_duplicate(<8 x half>
; CHECK-NEXT: vcvt.u32.f16 s14, s2
; CHECK-NEXT: vcvt.u32.f16 s2, s1
; CHECK-NEXT: vcvt.u32.f16 s0, s0
-; CHECK-NEXT: vcvt.u32.f16 s4, s4
-; CHECK-NEXT: vcvt.u32.f16 s6, s6
; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vcvt.u32.f16 s6, s6
+; CHECK-NEXT: vcvt.u32.f16 s4, s4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vcvt.u32.f16 s12, s3
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
@@ -4069,19 +4069,18 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) {
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r6, #-1
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: vcvtt.f32.f16 s30, s17
-; CHECK-NEXT: mov r7, r1
-; CHECK-NEXT: vmov r1, s30
; CHECK-NEXT: vcmp.f32 s16, #0
+; CHECK-NEXT: vcvtt.f32.f16 s30, s17
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s16, s28
+; CHECK-NEXT: mov r7, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
; CHECK-NEXT: vmov q5[2], q5[0], r0, r6
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s30
; CHECK-NEXT: bl __aeabi_f2ulz
; CHECK-NEXT: vcmp.f32 s30, #0
; CHECK-NEXT: mov r6, r0
@@ -4113,19 +4112,18 @@ define arm_aapcs_vfpcc <8 x i64> @test_unsigned_v8f16_v8i64(<8 x half> %f) {
; CHECK-NEXT: mov r4, r1
; CHECK-NEXT: vmov q5[3], q5[1], r7, r5
; CHECK-NEXT: bl __aeabi_f2ulz
-; CHECK-NEXT: vcvtt.f32.f16 s17, s18
-; CHECK-NEXT: mov r7, r1
-; CHECK-NEXT: vmov r1, s17
; CHECK-NEXT: vcmp.f32 s16, #0
+; CHECK-NEXT: vcvtt.f32.f16 s17, s18
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it lt
; CHECK-NEXT: movlt r0, #0
; CHECK-NEXT: vcmp.f32 s16, s28
+; CHECK-NEXT: mov r7, r1
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: it gt
; CHECK-NEXT: movgt.w r0, #-1
; CHECK-NEXT: vmov q6[2], q6[0], r0, r6
-; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: vmov r0, s17
; CHECK-NEXT: bl __aeabi_f2ulz
; CHECK-NEXT: vcmp.f32 s17, #0
; CHECK-NEXT: mov r6, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-frint.ll b/llvm/test/CodeGen/Thumb2/mve-frint.ll
index 1d7dcc8bf8440..b836a4b7014c7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-frint.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-frint.ll
@@ -60,10 +60,8 @@ define arm_aapcs_vfpcc <2 x double> @fceil_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl ceil
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl ceil
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -132,10 +130,8 @@ define arm_aapcs_vfpcc <2 x double> @ftrunc_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl trunc
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl trunc
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -204,10 +200,8 @@ define arm_aapcs_vfpcc <2 x double> @frint_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl rint
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl rint
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -266,10 +260,8 @@ define arm_aapcs_vfpcc <2 x double> @fnearbyint_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl nearbyint
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl nearbyint
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -338,10 +330,8 @@ define arm_aapcs_vfpcc <2 x double> @ffloor_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl floor
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl floor
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
@@ -410,10 +400,8 @@ define arm_aapcs_vfpcc <2 x double> @fround_float64_t(<2 x double> %src) {
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov r0, r1, d9
; CHECK-NEXT: bl round
-; CHECK-NEXT: vmov r2, r3, d8
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
+; CHECK-NEXT: vmov r0, r1, d8
; CHECK-NEXT: bl round
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov q0, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index fe28f785623ed..e67b2fe32b7e2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -65,13 +65,13 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b)
; CHECK-LABEL: ext_add_trunc_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.f32 s8, s6
+; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vmov.f32 s6, s7
; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov.f32 s8, s2
-; CHECK-NEXT: vmov.f32 s2, s3
-; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vmov.f32 s8, s3
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: add.w r12, r1, r0
; CHECK-NEXT: vmov r1, s6
; CHECK-NEXT: vmov r0, s0
diff --git a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
index 05f438acc3a7e..e00908dd037bb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll
@@ -64,22 +64,19 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(ptr nocapture readonly %pSrc, i32
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov r2, s16
; CHECK-NEXT: vmov d9, r0, r1
-; CHECK-NEXT: asrs r3, r2, #31
+; CHECK-NEXT: asrs r1, r2, #31
; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov.f32 s2, s21
; CHECK-NEXT: vmov d8, r0, r1
; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: asrs r3, r2, #31
+; CHECK-NEXT: asrs r1, r2, #31
; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov d11, r0, r1
-; CHECK-NEXT: asrs r3, r2, #31
+; CHECK-NEXT: asrs r1, r2, #31
; CHECK-NEXT: mov r0, r2
-; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bl __aeabi_l2d
; CHECK-NEXT: vmov d10, r0, r1
; CHECK-NEXT: vmov q1, q4
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index f4643f8c6c4a1..aaee97318ecd0 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -7,10 +7,10 @@ define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) {
; CHECK-LABEL: shuffle1_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -30,10 +30,10 @@ entry:
define arm_aapcs_vfpcc <4 x i32> @shuffle3_i32(<4 x i32> %src) {
; CHECK-LABEL: shuffle3_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s1
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -923,10 +923,10 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @shuffle2_i64(<2 x i64> %src) {
; CHECK-LABEL: shuffle2_i64:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s6, s0
-; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f32 s5, s3
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -948,10 +948,10 @@ entry:
define arm_aapcs_vfpcc <4 x float> @shuffle1_f32(<4 x float> %src) {
; CHECK-LABEL: shuffle1_f32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -971,10 +971,10 @@ entry:
define arm_aapcs_vfpcc <4 x float> @shuffle3_f32(<4 x float> %src) {
; CHECK-LABEL: shuffle3_f32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s1
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s2
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -1383,10 +1383,10 @@ entry:
define arm_aapcs_vfpcc <2 x double> @shuffle2_f64(<2 x double> %src) {
; CHECK-LABEL: shuffle2_f64:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s6, s0
-; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f32 s5, s3
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
index 6ce7550014296..de9328a3c2423 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
@@ -7,10 +7,10 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_45670123(<8 x i16> %s1, <8 x i16> %s2) {
; CHECK-LABEL: shuffle_i16_45670123:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s6, s0
-; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.f32 s4, s2
; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f32 s5, s3
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -21,10 +21,10 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_67452301(<8 x i16> %s1, <8 x i16> %s2) {
; CHECK-LABEL: shuffle_i16_67452301:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -69,10 +69,10 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_u7u5u3u1(<8 x i16> %s1, <8 x i16> %s2) {
; CHECK-LABEL: shuffle_i16_u7u5u3u1:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -83,10 +83,10 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_6u4u2u0u(<8 x i16> %s1, <8 x i16> %s2) {
; CHECK-LABEL: shuffle_i16_6u4u2u0u:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -120,10 +120,10 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45670123(<16 x i8> %s1, <16 x i8> %s2) {
; CHECK-LABEL: shuffle_i8_cdef89ab45670123:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -213,10 +213,10 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdeu89ub4u67u123(<16 x i8> %s1, <16 x i8> %s2) {
; CHECK-LABEL: shuffle_i8_cdeu89ub4u67u123:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -227,10 +227,10 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cduu8uubuu67u12u(<16 x i8> %s1, <16 x i8> %s2) {
; CHECK-LABEL: shuffle_i8_cduu8uubuu67u12u:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry:
@@ -241,10 +241,10 @@ entry:
define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cuuuuuubuu6uuu2u(<16 x i8> %s1, <16 x i8> %s2) {
; CHECK-LABEL: shuffle_i8_cuuuuuubuu6uuu2u:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s4, s3
; CHECK-NEXT: vmov.f32 s6, s1
-; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
entry: @@ -261,9 +261,9 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45u700123(<16 x i8> %s1, <1 ; CHECK-NEXT: vmov.8 q1[9], r0 ; CHECK-NEXT: vmov.u8 r0, q0[0] ; CHECK-NEXT: vmov.8 q1[11], r0 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov.f32 s4, s3 ; CHECK-NEXT: vmov.f32 s5, s2 -; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -278,10 +278,10 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle_f16_45670123(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_45670123: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s6, s0 -; CHECK-NEXT: vmov.f32 s5, s3 +; CHECK-NEXT: vmov.f32 s4, s2 ; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vmov.f32 s5, s3 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -292,10 +292,10 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle_f16_67452301(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_67452301: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s4, s3 ; CHECK-NEXT: vmov.f32 s6, s1 -; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -340,10 +340,10 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle_f16_u7u5u3u1(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_u7u5u3u1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s4, s3 ; CHECK-NEXT: vmov.f32 s6, s1 -; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: @@ -354,10 +354,10 @@ entry: define arm_aapcs_vfpcc <8 x half> @shuffle_f16_6u4u2u0u(<8 x half> %s1, <8 x half> %s2) { ; CHECK-LABEL: shuffle_f16_6u4u2u0u: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s4, s3 +; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov.f32 s5, s2 +; CHECK-NEXT: vmov.f32 s4, s3 ; CHECK-NEXT: vmov.f32 s6, s1 -; CHECK-NEXT: vmov.f32 s7, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll index b3f7b7d961ad0..bf921d2602f24 100644 --- a/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll +++ b/llvm/test/CodeGen/Thumb2/mve-simple-arith.ll @@ -117,11 +117,9 @@ define arm_aapcs_vfpcc <2 x double> @add_float64_t(<2 x double> %src1, <2 x doub ; CHECK-NEXT: vmov r0, r1, d11 ; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_dadd -; CHECK-NEXT: vmov lr, r12, d10 -; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: mov r0, lr -; CHECK-NEXT: mov r1, r12 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: bl __aeabi_dadd ; CHECK-NEXT: vmov d8, r0, r1 ; CHECK-NEXT: vmov q0, q4 @@ -248,11 +246,9 @@ define arm_aapcs_vfpcc <2 x double> @sub_float64_t(<2 x double> %src1, <2 x doub ; CHECK-NEXT: vmov r0, r1, d11 ; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_dsub -; CHECK-NEXT: vmov lr, r12, d10 -; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: mov r0, lr -; CHECK-NEXT: mov r1, r12 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: bl __aeabi_dsub ; CHECK-NEXT: vmov d8, r0, r1 ; CHECK-NEXT: vmov q0, q4 @@ -381,11 +377,9 @@ define arm_aapcs_vfpcc <2 x double> @mul_float64_t(<2 x double> %src1, <2 x doub ; CHECK-NEXT: vmov r0, r1, d11 ; CHECK-NEXT: vmov r2, r3, d9 ; CHECK-NEXT: bl __aeabi_dmul -; CHECK-NEXT: vmov lr, r12, d10 -; CHECK-NEXT: vmov r2, r3, d8 ; 
CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: mov r0, lr -; CHECK-NEXT: mov r1, r12 +; CHECK-NEXT: vmov r0, r1, d10 +; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: bl __aeabi_dmul ; CHECK-NEXT: vmov d8, r0, r1 ; CHECK-NEXT: vmov q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll index 1279714b5a78c..04dcc77e9f937 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll @@ -374,17 +374,17 @@ define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y, ; CHECK-NEXT: .LBB17_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vmov.f32 s12, s10 ; CHECK-NEXT: vmov.f32 s8, s6 ; CHECK-NEXT: vmov r7, s4 ; CHECK-NEXT: vmov.f32 s6, s7 +; CHECK-NEXT: vmov r4, s12 ; CHECK-NEXT: vmov r3, s8 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vmov.f32 s12, s10 ; CHECK-NEXT: vmov.f32 s10, s5 ; CHECK-NEXT: vmov.f32 s14, s11 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: asr.w r12, r3, #31 ; CHECK-NEXT: subs.w r8, r3, r4 +; CHECK-NEXT: asr.w r12, r3, #31 ; CHECK-NEXT: sbc.w r12, r12, r4, asr #31 ; CHECK-NEXT: vmov r4, s10 ; CHECK-NEXT: vmov.f32 s10, s9 diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll index ad7a09fa50acb..368884802a315 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll @@ -266,10 +266,8 @@ define arm_aapcs_vfpcc <2 x double> @foo_float_int64(<2 x i64> %src) { ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_l2d -; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_l2d ; CHECK-NEXT: vmov d8, r0, r1 ; CHECK-NEXT: vmov q0, q4 @@ -290,10 +288,8 @@ define arm_aapcs_vfpcc <2 x double> @foo_float_uint64(<2 x i64> %src) { ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_ul2d -; CHECK-NEXT: vmov r2, r3, d8 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: vmov r0, r1, d8 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov d8, r0, r1 ; CHECK-NEXT: vmov q0, q4 diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll index a5725a2a30048..82db1a95037a9 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll @@ -18,10 +18,10 @@ entry: define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) { ; CHECK-LABEL: fpext_8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vcvtt.f32.f16 s11, s1 -; CHECK-NEXT: vcvtb.f32.f16 s10, s1 ; CHECK-NEXT: vcvtt.f32.f16 s9, s0 +; CHECK-NEXT: vcvtb.f32.f16 s10, s1 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0 +; CHECK-NEXT: vcvtt.f32.f16 s11, s1 ; CHECK-NEXT: vcvtt.f32.f16 s7, s3 ; CHECK-NEXT: vcvtb.f32.f16 s6, s3 ; CHECK-NEXT: vcvtt.f32.f16 s5, s2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll index b49f19e55c895..271beac139288 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll @@ -10,17 +10,15 @@ define void @vld4_v2i32(ptr %src, ptr %dst) { ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: vmov.f32 s10, s7 ; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.f32 s6, s5 ; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmov.f32 s8, s3 ; CHECK-NEXT: vmov.f32 s12, s1 ; CHECK-NEXT: vmov r0, s10 ; CHECK-NEXT: add r0, r2 -; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: add r2, r3 -; CHECK-NEXT: 
vmov r3, s2 ; CHECK-NEXT: add.w r12, r2, r0 ; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: add r2, r3 ; CHECK-NEXT: vmov r3, s12 diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll index b005cb92dc516..8e6e0191e2670 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll @@ -330,8 +330,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vmovn32_t2(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vmovn32_t2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vmov.f32 s7, s2 +; CHECK-NEXT: vmov.f32 s5, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr ; @@ -420,8 +420,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @vmovn32_b4(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: vmovn32_b4: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov.f32 s5, s1 ; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vmov.f32 s5, s1 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr ; diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll index b36904495e878..176246427e64c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll @@ -131,8 +131,8 @@ define void @vst4_v16i32(ptr %src, ptr %dst) { ; CHECK-NEXT: vldrw.u32 q3, [r0, #192] ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vmov q7, q5 ; CHECK-NEXT: vldrw.u32 q3, [r0, #224] @@ -897,8 +897,8 @@ define void @vst4_v16f32(ptr %src, ptr %dst) { ; CHECK-NEXT: vldrw.u32 q3, [r0, #192] ; CHECK-NEXT: vldrw.u32 q1, [r0, #64] ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #160] ; CHECK-NEXT: add r2, sp, #128 ; CHECK-NEXT: vmov q7, q5 ; CHECK-NEXT: vldrw.u32 q3, [r0, #224] diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll index 0d7354fcbbf2c..bad1bf15c145f 100644 --- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll @@ -62,23 +62,19 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(ptr nocapture readonly %pSrc, i32 ; CHECK-NEXT: vand q6, q0, q5 ; CHECK-NEXT: vmov r0, r1, d13 ; CHECK-NEXT: bl __aeabi_ul2d -; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov.f32 s0, s18 ; CHECK-NEXT: vmov.f32 s2, s19 ; CHECK-NEXT: vmov d9, r0, r1 -; CHECK-NEXT: vand q5, q0, q5 +; CHECK-NEXT: vmov r0, r1, d12 ; CHECK-NEXT: vmov r4, r5, d11 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: vand q5, q0, q5 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov d8, r0, r1 ; CHECK-NEXT: mov r0, r4 ; CHECK-NEXT: mov r1, r5 ; CHECK-NEXT: bl __aeabi_ul2d -; CHECK-NEXT: vmov r2, r3, d10 ; CHECK-NEXT: vmov d11, r0, r1 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: mov r1, r3 +; CHECK-NEXT: vmov r0, r1, d10 ; CHECK-NEXT: bl __aeabi_ul2d ; CHECK-NEXT: vmov d10, r0, r1 ; CHECK-NEXT: vmov q0, q4 diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll index a4d15a1b21d6b..f9d6663b5b8a3 100644 --- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll @@ -18,11 +18,11 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NEXT: movq 8(%rdi), %r18 ; 
EGPR-NEXT: movq 24(%rdi), %r29 ; EGPR-NEXT: movq 16(%rdi), %r17 +; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq 32(%rdi), %r10 +; EGPR-NEXT: movq 56(%rdi), %r15 ; EGPR-NEXT: movq 40(%rdi), %rdi -; EGPR-NEXT: movq 32(%r24), %r10 -; EGPR-NEXT: movq 56(%r24), %r15 ; EGPR-NEXT: movq 48(%r24), %r12 -; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NEXT: movq 24(%rsi), %r23 ; EGPR-NEXT: movq 16(%rsi), %r11 ; EGPR-NEXT: movq (%rsi), %r27 @@ -1295,15 +1295,22 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; EGPR-NDD-NEXT: adcq %rcx, %r29, %r8 ; EGPR-NDD-NEXT: adcq $0, %rdi ; EGPR-NDD-NEXT: adcq $0, %rsi, %r9 +; EGPR-NDD-NEXT: movq %r11, %r14 +; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq 48(%r11), %r11 +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: movq 48(%r15), %r11 ; EGPR-NDD-NEXT: movq %r17, %rsi ; EGPR-NDD-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: movq %r17, %rax ; EGPR-NDD-NEXT: mulq %r11 -; EGPR-NDD-NEXT: movq %rdx, %r28 ; EGPR-NDD-NEXT: movq %rax, %r29 -; EGPR-NDD-NEXT: movq %r10, %rax -; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r10, %rsi +; EGPR-NDD-NEXT: movq %rdx, %r28 +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r16, %r10 +; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; EGPR-NDD-NEXT: mulq %r11 ; EGPR-NDD-NEXT: addq %rax, %r28 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index 3fb994cdb751a..ab48bc292a40c 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -332,11 +332,11 @@ define void @store_i256(ptr %ptr, i256 %v) { ; CHECK-O3: # %bb.0: ; CHECK-O3-NEXT: subq $40, %rsp ; CHECK-O3-NEXT: .cfi_def_cfa_offset 48 +; CHECK-O3-NEXT: movq %rsi, (%rsp) ; CHECK-O3-NEXT: movq %rdi, %rax ; CHECK-O3-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; CHECK-O3-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-O3-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; CHECK-O3-NEXT: movq %rsi, (%rsp) ; CHECK-O3-NEXT: movq %rsp, %rdx ; CHECK-O3-NEXT: movl $32, %edi ; CHECK-O3-NEXT: movq %rax, %rsi diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll index 07e86cb01e133..4937bcd110b78 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll @@ -392,10 +392,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8> ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vmpsadbw $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xd9,0x02] ; X86-NEXT: vmpsadbw $3, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x49,0x42,0xe1,0x03] ; X86-NEXT: vmpsadbw $4, %zmm1, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0xc9,0x42,0xd1,0x04] -; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] +; X86-NEXT: vmpsadbw $2, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xc1,0x02] ; X86-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -403,10 +402,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } 
@test_mm512_mask_mpsadbw(<64 x i8> ; X64: # %bb.0: ; X64-NEXT: vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vmpsadbw $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xd9,0x02] ; X64-NEXT: vmpsadbw $3, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x49,0x42,0xe1,0x03] ; X64-NEXT: vmpsadbw $4, %zmm1, %zmm0, %zmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0xc9,0x42,0xd1,0x04] -; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] +; X64-NEXT: vmpsadbw $2, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x42,0xc1,0x02] ; X64-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc] ; X64-NEXT: retq # encoding: [0xc3] %msk = bitcast i32 %x4 to <32 x i1> diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll index 31cec891c4cf3..1a9eeaeefad96 100644 --- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll @@ -572,10 +572,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, ; X86: # %bb.0: ; X86-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vmpsadbw $2, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xd9,0x02] ; X86-NEXT: vmpsadbw $3, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x09,0x42,0xe1,0x03] ; X86-NEXT: vmpsadbw $4, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0x89,0x42,0xd1,0x04] -; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] +; X86-NEXT: vmpsadbw $2, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x02] ; X86-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -583,10 +582,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, ; X64: # %bb.0: ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vmpsadbw $2, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xd9,0x02] ; X64-NEXT: vmpsadbw $3, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x09,0x42,0xe1,0x03] ; X64-NEXT: vmpsadbw $4, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0x89,0x42,0xd1,0x04] -; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] +; X64-NEXT: vmpsadbw $2, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x02] ; X64-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc] ; X64-NEXT: retq # encoding: [0xc3] %msk = bitcast i8 %x4 to <8 x i1> @@ -606,10 +604,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_mask_mpsadbw_256(<32 x i8> % ; X86: # %bb.0: ; X86-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vmpsadbw $2, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xd9,0x02] ; X86-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x29,0x42,0xe1,0x03] ; X86-NEXT: vmpsadbw $4, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: 
[0x62,0xf3,0x7e,0xa9,0x42,0xd1,0x04] -; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] +; X86-NEXT: vmpsadbw $2, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x02] ; X86-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -617,10 +614,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_mask_mpsadbw_256(<32 x i8> % ; X64: # %bb.0: ; X64-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vmpsadbw $2, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xd9,0x02] ; X64-NEXT: vmpsadbw $3, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7e,0x29,0x42,0xe1,0x03] ; X64-NEXT: vmpsadbw $4, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7e,0xa9,0x42,0xd1,0x04] -; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] +; X64-NEXT: vmpsadbw $2, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x02] ; X64-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc] ; X64-NEXT: retq # encoding: [0xc3] %msk = bitcast i16 %x4 to <16 x i1> diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index b39b089faa2a5..5d97a56ce12c6 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -937,17 +937,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: kmovw %k1, %r12d ; KNL-NEXT: kshiftrw $13, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r13d -; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: andl $1, %edx ; KNL-NEXT: movb %dl, 2(%rax) ; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: andl $1, %edx ; KNL-NEXT: andl $1, %r9d +; KNL-NEXT: kshiftrw $14, %k0, %k1 ; KNL-NEXT: leal (%rdx,%r9,2), %r9d ; KNL-NEXT: kmovw %k1, %edx -; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: andl $1, %r8d ; KNL-NEXT: leal (%r9,%r8,4), %r9d +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kmovw %k0, %r8d ; KNL-NEXT: andl $1, %esi ; KNL-NEXT: leal (%r9,%rsi,8), %esi @@ -1250,17 +1250,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kmovd %k1, %r12d ; SKX-NEXT: kshiftrd $13, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r13d -; SKX-NEXT: kshiftrd $14, %k0, %k1 ; SKX-NEXT: andl $1, %edx ; SKX-NEXT: movb %dl, 2(%rax) ; SKX-NEXT: kmovd %k0, %edx ; SKX-NEXT: andl $1, %edx ; SKX-NEXT: andl $1, %r9d +; SKX-NEXT: kshiftrd $14, %k0, %k1 ; SKX-NEXT: leal (%rdx,%r9,2), %r9d ; SKX-NEXT: kmovd %k1, %edx -; SKX-NEXT: kshiftrd $15, %k0, %k0 ; SKX-NEXT: andl $1, %r8d ; SKX-NEXT: leal (%r9,%r8,4), %r9d +; SKX-NEXT: kshiftrd $15, %k0, %k0 ; SKX-NEXT: kmovd %k0, %r8d ; SKX-NEXT: andl $1, %esi ; SKX-NEXT: leal (%r9,%rsi,8), %esi @@ -1563,56 +1563,56 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: kmovw %k1, %edx ; KNL_X32-NEXT: kshiftrw $5, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %ecx -; KNL_X32-NEXT: kshiftrw $6, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ebx ; KNL_X32-NEXT: movb %bl, 2(%eax) ; KNL_X32-NEXT: kmovw %k0, %ebx ; KNL_X32-NEXT: andl $1, %ebx ; KNL_X32-NEXT: andl $1, %ebp +; KNL_X32-NEXT: kshiftrw $6, %k0, %k1 ; KNL_X32-NEXT: leal (%ebx,%ebp,2), %ebx ; KNL_X32-NEXT: kmovw %k1, %ebp -; KNL_X32-NEXT: kshiftrw $7, %k0, %k1 ; KNL_X32-NEXT: andl $1, %esi ; KNL_X32-NEXT: leal (%ebx,%esi,4), %ebx +; KNL_X32-NEXT: 
kshiftrw $7, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %esi -; KNL_X32-NEXT: kshiftrw $8, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edi ; KNL_X32-NEXT: leal (%ebx,%edi,8), %ebx +; KNL_X32-NEXT: kshiftrw $8, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %edi -; KNL_X32-NEXT: kshiftrw $9, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edx ; KNL_X32-NEXT: shll $4, %edx ; KNL_X32-NEXT: orl %ebx, %edx +; KNL_X32-NEXT: kshiftrw $9, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %ebx -; KNL_X32-NEXT: kshiftrw $10, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ecx ; KNL_X32-NEXT: shll $5, %ecx ; KNL_X32-NEXT: orl %edx, %ecx +; KNL_X32-NEXT: kshiftrw $10, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %edx -; KNL_X32-NEXT: kshiftrw $11, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ebp ; KNL_X32-NEXT: shll $6, %ebp ; KNL_X32-NEXT: andl $1, %esi ; KNL_X32-NEXT: shll $7, %esi +; KNL_X32-NEXT: kshiftrw $11, %k0, %k1 ; KNL_X32-NEXT: orl %ebp, %esi ; KNL_X32-NEXT: kmovw %k1, %ebp -; KNL_X32-NEXT: kshiftrw $12, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edi ; KNL_X32-NEXT: shll $8, %edi ; KNL_X32-NEXT: orl %esi, %edi +; KNL_X32-NEXT: kshiftrw $12, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %esi -; KNL_X32-NEXT: kshiftrw $13, %k0, %k1 ; KNL_X32-NEXT: andl $1, %ebx ; KNL_X32-NEXT: shll $9, %ebx ; KNL_X32-NEXT: orl %edi, %ebx +; KNL_X32-NEXT: kshiftrw $13, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %edi -; KNL_X32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_X32-NEXT: andl $1, %edx ; KNL_X32-NEXT: shll $10, %edx ; KNL_X32-NEXT: orl %ebx, %edx +; KNL_X32-NEXT: kshiftrw $14, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %ebx -; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 ; KNL_X32-NEXT: orl %ecx, %edx +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 ; KNL_X32-NEXT: kmovw %k0, %ecx ; KNL_X32-NEXT: andl $1, %ebp ; KNL_X32-NEXT: shll $11, %ebp @@ -1891,17 +1891,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kmovd %k1, %r12d ; FASTISEL-NEXT: kshiftrd $13, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r13d -; FASTISEL-NEXT: kshiftrd $14, %k0, %k1 ; FASTISEL-NEXT: andl $1, %edx ; FASTISEL-NEXT: movb %dl, 2(%rax) ; FASTISEL-NEXT: kmovd %k0, %edx ; FASTISEL-NEXT: andl $1, %edx ; FASTISEL-NEXT: andl $1, %r9d +; FASTISEL-NEXT: kshiftrd $14, %k0, %k1 ; FASTISEL-NEXT: leal (%rdx,%r9,2), %r9d ; FASTISEL-NEXT: kmovd %k1, %edx -; FASTISEL-NEXT: kshiftrd $15, %k0, %k0 ; FASTISEL-NEXT: andl $1, %r8d ; FASTISEL-NEXT: leal (%r9,%r8,4), %r9d +; FASTISEL-NEXT: kshiftrd $15, %k0, %k0 ; FASTISEL-NEXT: kmovd %k0, %r8d ; FASTISEL-NEXT: andl $1, %esi ; FASTISEL-NEXT: leal (%r9,%rsi,8), %esi @@ -3113,22 +3113,22 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-NEXT: kmovw %k1, %eax ; KNL_X32-NEXT: kshiftrw $1, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %edx -; KNL_X32-NEXT: kshiftrw $2, %k0, %k1 ; KNL_X32-NEXT: kmovw %k0, %ebx ; KNL_X32-NEXT: andb $1, %bl ; KNL_X32-NEXT: andb $1, %dl ; KNL_X32-NEXT: addb %dl, %dl +; KNL_X32-NEXT: kshiftrw $2, %k0, %k1 ; KNL_X32-NEXT: orb %bl, %dl ; KNL_X32-NEXT: kmovw %k1, %ebx -; KNL_X32-NEXT: kshiftrw $3, %k0, %k1 ; KNL_X32-NEXT: andb $1, %bl ; KNL_X32-NEXT: shlb $2, %bl ; KNL_X32-NEXT: orb %dl, %bl +; KNL_X32-NEXT: kshiftrw $3, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %edx -; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 ; KNL_X32-NEXT: andb $1, %dl ; KNL_X32-NEXT: shlb $3, %dl ; KNL_X32-NEXT: orb %bl, %dl +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 ; KNL_X32-NEXT: kmovw %k0, %ebx ; KNL_X32-NEXT: andb $1, %bl ; KNL_X32-NEXT: shlb $4, %bl diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll index bafa33ff9a1c8..2cace3060def4 
100644 --- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll @@ -9,21 +9,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8> ; X86BW-LABEL: test_vgf2p8affineinvqb_128: ; X86BW: # %bb.0: ; X86BW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03] -; X86BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04] ; X86BW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05] +; X86BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03] +; X86BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xc9,0x04] ; X86BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] -; X86BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc] ; X86BW-NEXT: retl # encoding: [0xc3] ; ; X64BW-LABEL: test_vgf2p8affineinvqb_128: ; X64BW: # %bb.0: ; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03] -; X64BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04] ; X64BW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05] +; X64BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03] +; X64BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xc9,0x04] ; X64BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] -; X64BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc] ; X64BW-NEXT: retq # encoding: [0xc3] ; ; X86NOBW-LABEL: test_vgf2p8affineinvqb_128: @@ -70,21 +68,19 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8> ; X86BW-LABEL: test_vgf2p8affineinvqb_256: ; X86BW: # %bb.0: ; X86BW-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03] -; X86BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04] ; X86BW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05] +; X86BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03] +; X86BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xc9,0x04] ; X86BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] -; X86BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc] ; X86BW-NEXT: retl # encoding: [0xc3] ; ; X64BW-LABEL: test_vgf2p8affineinvqb_256: ; X64BW: # %bb.0: ; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03] -; X64BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 
{%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04] ; X64BW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05] +; X64BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03] +; X64BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xc9,0x04] ; X64BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] -; X64BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc] ; X64BW-NEXT: retq # encoding: [0xc3] ; ; X86NOBW-LABEL: test_vgf2p8affineinvqb_256: @@ -138,21 +134,19 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> ; X86BW-LABEL: test_vgf2p8affineinvqb_512: ; X86BW: # %bb.0: ; X86BW-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03] -; X86BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04] ; X86BW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05] +; X86BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03] +; X86BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xc9,0x04] ; X86BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] -; X86BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc] ; X86BW-NEXT: retl # encoding: [0xc3] ; ; X64BW-LABEL: test_vgf2p8affineinvqb_512: ; X64BW: # %bb.0: ; X64BW-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03] -; X64BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04] ; X64BW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05] +; X64BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03] +; X64BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xc9,0x04] ; X64BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] -; X64BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc] ; X64BW-NEXT: retq # encoding: [0xc3] ; ; X86NOBW-LABEL: test_vgf2p8affineinvqb_512: @@ -226,21 +220,19 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s ; X86BW-LABEL: test_vgf2p8affineqb_128: ; X86BW: # %bb.0: ; X86BW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03] -; X86BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04] ; X86BW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05] +; X86BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03] +; X86BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xc9,0x04] ; X86BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] -; 
X86BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc] ; X86BW-NEXT: retl # encoding: [0xc3] ; ; X64BW-LABEL: test_vgf2p8affineqb_128: ; X64BW: # %bb.0: ; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03] -; X64BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04] ; X64BW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05] +; X64BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03] +; X64BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xc9,0x04] ; X64BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] -; X64BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc] ; X64BW-NEXT: retq # encoding: [0xc3] ; ; X86NOBW-LABEL: test_vgf2p8affineqb_128: @@ -287,21 +279,19 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s ; X86BW-LABEL: test_vgf2p8affineqb_256: ; X86BW: # %bb.0: ; X86BW-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03] -; X86BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04] ; X86BW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05] +; X86BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03] +; X86BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xc9,0x04] ; X86BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] -; X86BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc] ; X86BW-NEXT: retl # encoding: [0xc3] ; ; X64BW-LABEL: test_vgf2p8affineqb_256: ; X64BW: # %bb.0: ; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03] -; X64BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04] ; X64BW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05] +; X64BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03] +; X64BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xc9,0x04] ; X64BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] -; X64BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc] ; X64BW-NEXT: retq # encoding: [0xc3] ; ; X86NOBW-LABEL: test_vgf2p8affineqb_256: @@ -355,21 +345,19 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s ; X86BW-LABEL: test_vgf2p8affineqb_512: ; X86BW: # %bb.0: ; X86BW-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03] -; X86BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # 
encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04] ; X86BW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05] +; X86BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03] +; X86BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xc9,0x04] ; X86BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] -; X86BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc] ; X86BW-NEXT: retl # encoding: [0xc3] ; ; X64BW-LABEL: test_vgf2p8affineqb_512: ; X64BW: # %bb.0: ; X64BW-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03] -; X64BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04] ; X64BW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05] +; X64BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03] +; X64BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xc9,0x04] ; X64BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] -; X64BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc] ; X64BW-NEXT: retq # encoding: [0xc3] ; ; X86NOBW-LABEL: test_vgf2p8affineqb_512: diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 2a77d0238721c..113828ae54ccd 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -313,10 +313,10 @@ define i16 @test15(ptr%addr) nounwind { define i16 @test16(ptr%addr, i16 %a) nounwind { ; KNL-LABEL: test16: ; KNL: ## %bb.0: -; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %esi, %k0 ; KNL-NEXT: movw $-1025, %cx ## imm = 0xFBFF ; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -349,10 +349,10 @@ define i16 @test16(ptr%addr, i16 %a) nounwind { define i8 @test17(ptr%addr, i8 %a) nounwind { ; KNL-LABEL: test17: ; KNL: ## %bb.0: -; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %esi, %k0 ; KNL-NEXT: movw $-17, %cx ; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -843,12 +843,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; KNL-LABEL: test_insertelement_v32i1: ; KNL: ## %bb.0: ; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0 -; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx +; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: movw $-17, %dx ; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 @@ -862,12 +862,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; SKX-LABEL: test_insertelement_v32i1: ; SKX: ## %bb.0: ; SKX-NEXT: cmpl %esi, %edi -; SKX-NEXT: setb %al ; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0 ; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1 ; SKX-NEXT: kunpckwd %k0, %k1, %k0 ; SKX-NEXT: movl $-17, %ecx ; SKX-NEXT: kmovd %ecx, %k1 +; SKX-NEXT: setb %al ; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftld 
$31, %k1, %k1 @@ -889,10 +889,10 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 ; KNL-NEXT: cmpl %esi, %edi -; KNL-NEXT: setb %al ; KNL-NEXT: movw $-5, %cx ; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1} +; KNL-NEXT: setb %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $13, %k1, %k1 @@ -905,10 +905,10 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; SKX-LABEL: test_iinsertelement_v4i1: ; SKX: ## %bb.0: ; SKX-NEXT: cmpl %esi, %edi -; SKX-NEXT: setb %al ; SKX-NEXT: movb $-5, %cl ; SKX-NEXT: kmovd %ecx, %k1 ; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1} +; SKX-NEXT: setb %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index b77c753107a6e..eda085b0a5c4a 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -1013,9 +1013,8 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z} ; X64-NEXT: vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1} -; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X64-NEXT: vcvtps2ph $2, %zmm0, (%rsi) -; X64-NEXT: vmovdqa %ymm1, %ymm0 +; X64-NEXT: vpaddw %ymm1, %ymm2, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_x86_vcvtps2ph_256: @@ -1024,9 +1023,8 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vcvtps2ph $3, {sae}, %zmm0, %ymm2 {%k1} {z} ; X86-NEXT: vcvtps2ph $4, {sae}, %zmm0, %ymm1 {%k1} -; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ; X86-NEXT: vcvtps2ph $2, %zmm0, (%eax) -; X86-NEXT: vmovdqa %ymm1, %ymm0 +; X86-NEXT: vpaddw %ymm1, %ymm2, %ymm0 ; X86-NEXT: retl %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask) diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 9e689341f7b88..373bc00d004bd 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1266,10 +1266,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: cmpl %edx, %esi -; KNL-NEXT: setg %al ; KNL-NEXT: movw $-33, %cx ; KNL-NEXT: kmovw %ecx, %k4 ; KNL-NEXT: kandw %k4, %k0, %k0 +; KNL-NEXT: setg %al ; KNL-NEXT: kmovw %eax, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: kshiftrw $10, %k4, %k4 @@ -1291,10 +1291,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovq %rdi, %k0 ; SKX-NEXT: cmpl %edx, %esi -; SKX-NEXT: setg %al ; SKX-NEXT: movq $-33, %rcx ; SKX-NEXT: kmovq %rcx, %k1 ; SKX-NEXT: kandq %k1, %k0, %k0 +; SKX-NEXT: setg %al ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlq $63, %k1, %k1 ; SKX-NEXT: kshiftrq $58, %k1, %k1 @@ -1306,10 +1306,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: kmovq %rdi, %k0 ; AVX512BW-NEXT: cmpl %edx, %esi -; AVX512BW-NEXT: setg %al ; AVX512BW-NEXT: movq $-33, %rcx ; AVX512BW-NEXT: kmovq %rcx, %k1 ; AVX512BW-NEXT: kandq 
%k1, %k0, %k0 +; AVX512BW-NEXT: setg %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: kshiftlq $63, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 @@ -1329,10 +1329,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kmovw %edi, %k3 ; AVX512DQ-NEXT: cmpl %edx, %esi -; AVX512DQ-NEXT: setg %al ; AVX512DQ-NEXT: movw $-33, %cx ; AVX512DQ-NEXT: kmovw %ecx, %k4 ; AVX512DQ-NEXT: kandw %k4, %k1, %k1 +; AVX512DQ-NEXT: setg %al ; AVX512DQ-NEXT: kmovw %eax, %k4 ; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 ; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 @@ -1355,11 +1355,11 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax -; X86-NEXT: setg %al ; X86-NEXT: kshiftrq $6, %k0, %k1 ; X86-NEXT: kshiftlq $6, %k1, %k1 ; X86-NEXT: kshiftlq $59, %k0, %k0 ; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: setg %al ; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: kmovd %eax, %k1 ; X86-NEXT: kshiftlq $63, %k1, %k1 diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 51ffeca52a665..9327ee800af19 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -215,11 +215,10 @@ declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32) define { <8 x i64>, <8 x i64> } @test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) nounwind { ; CHECK-LABEL: test_int_x86_avx512_psll_dq_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslldq $8, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x73,0xf8,0x08] -; CHECK-NEXT: # zmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55] ; CHECK-NEXT: vpslldq $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xf8,0x04] ; CHECK-NEXT: # zmm1 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] +; CHECK-NEXT: vpslldq $8, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0xf8,0x08] +; CHECK-NEXT: # zmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8) %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4) @@ -251,11 +250,10 @@ declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32) define { <8 x i64>, <8 x i64> } @test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) nounwind { ; CHECK-LABEL: test_int_x86_avx512_psrl_dq_512: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsrldq $8, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x73,0xd8,0x08] -; CHECK-NEXT: # zmm2 = 
zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: vpsrldq $4, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xd8,0x04]
 ; CHECK-NEXT: # zmm1 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
-; CHECK-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2]
+; CHECK-NEXT: vpsrldq $8, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0xd8,0x08]
+; CHECK-NEXT: # zmm0 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
 %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
 %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
@@ -1961,9 +1959,9 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin
 ; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2]
 ; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
-; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
 ; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2]
 ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
@@ -2136,9 +2134,9 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
 ; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2]
 ; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8]
 ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
-; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
 ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20]
 ; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
 ; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2]
 ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
@@ -3017,10 +3015,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X86-NEXT: vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
-; X86-NEXT: vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
 ; X86-NEXT: vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X86-NEXT: vdbpsadbw $3, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xc9,0x03]
 ; X86-NEXT: vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
-; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -3028,10 +3025,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X64-NEXT: vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
-; X64-NEXT: vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
 ; X64-NEXT: vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X64-NEXT: vdbpsadbw $3, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xc9,0x03]
 ; X64-NEXT: vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
-; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 3, <32 x i16> zeroinitializer, i32 %x4)
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
index 41e2aa003ce7a..54f51be57536f 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -1244,10 +1244,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X86-NEXT: vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
-; X86-NEXT: vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
 ; X86-NEXT: vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X86-NEXT: vdbpsadbw $3, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xc9,0x03]
 ; X86-NEXT: vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
-; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -1255,10 +1254,9 @@ define { <32 x i16>, <32 x i16>, <32 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X64-NEXT: vmovdqa64 %zmm2, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xe1,0x02]
-; X64-NEXT: vdbpsadbw $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x03]
 ; X64-NEXT: vdbpsadbw $4, %zmm1, %zmm0, %zmm2 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xd1,0x04]
+; X64-NEXT: vdbpsadbw $3, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xc9,0x03]
 ; X64-NEXT: vmovdqa64 %zmm4, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc4]
-; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %1 = call <32 x i16> @llvm.x86.avx512.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2)
 %2 = bitcast i32 %x4 to <32 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index ae710cc40a522..1cc481174c003 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -6934,10 +6934,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
-; X86-NEXT: vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
 ; X86-NEXT: vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X86-NEXT: vdbpsadbw $3, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc9,0x03]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
@@ -6945,10 +6944,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
-; X64-NEXT: vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
 ; X64-NEXT: vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X64-NEXT: vdbpsadbw $3, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc9,0x03]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 3, <8 x i16> zeroinitializer, i8 %x4)
@@ -6967,10 +6965,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X86-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
-; X86-NEXT: vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
 ; X86-NEXT: vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X86-NEXT: vdbpsadbw $3, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc9,0x03]
 ; X86-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
@@ -6978,10 +6975,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X64-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
-; X64-NEXT: vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
 ; X64-NEXT: vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X64-NEXT: vdbpsadbw $3, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc9,0x03]
 ; X64-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4)
 %res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 3, <16 x i16> zeroinitializer, i16 %x4)
diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index f76b96eda7540..e55e469391542 100644
--- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1517,19 +1517,17 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmov_wb_128
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT: vpmovwb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc3]
 ; X86-NEXT: vpmovwb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
 ; X86-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
-; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT: vpmovwb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vpmovwb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc3]
 ; X64-NEXT: vpmovwb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
 ; X64-NEXT: vpmovwb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
-; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64-NEXT: vpmovwb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
@@ -1570,19 +1568,17 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmovs_wb_12
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT: vpmovswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc3]
 ; X86-NEXT: vpmovswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
 ; X86-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
-; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT: vpmovswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vpmovswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc3]
 ; X64-NEXT: vpmovswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
 ; X64-NEXT: vpmovswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
-; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64-NEXT: vpmovswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
@@ -1623,19 +1619,17 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_int_x86_avx512_mask_pmovus_wb_1
 ; X86: # %bb.0:
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
-; X86-NEXT: vpmovuswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc3]
 ; X86-NEXT: vpmovuswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
 ; X86-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
-; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86-NEXT: vpmovuswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vpmovuswb %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc3]
 ; X64-NEXT: vpmovuswb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
 ; X64-NEXT: vpmovuswb %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
-; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64-NEXT: vpmovuswb %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 -1)
 %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2)
@@ -1994,10 +1988,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
-; X86-NEXT: vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
 ; X86-NEXT: vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X86-NEXT: vdbpsadbw $3, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc9,0x03]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
@@ -2005,10 +1998,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_dbpsadbw_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vdbpsadbw $2, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x42,0xe1,0x02]
-; X64-NEXT: vdbpsadbw $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x03]
 ; X64-NEXT: vdbpsadbw $4, %xmm1, %xmm0, %xmm2 # encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd1,0x04]
+; X64-NEXT: vdbpsadbw $3, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc9,0x03]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %1 = call <8 x i16> @llvm.x86.avx512.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2)
 %2 = bitcast i8 %x4 to <8 x i1>
@@ -2031,10 +2023,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X86-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
-; X86-NEXT: vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
 ; X86-NEXT: vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X86-NEXT: vdbpsadbw $3, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc9,0x03]
 ; X86-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
@@ -2042,10 +2033,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_dbpsadbw
 ; X64-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vdbpsadbw $2, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x42,0xe1,0x02]
-; X64-NEXT: vdbpsadbw $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x03]
 ; X64-NEXT: vdbpsadbw $4, %ymm1, %ymm0, %ymm2 # encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd1,0x04]
+; X64-NEXT: vdbpsadbw $3, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc9,0x03]
 ; X64-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %1 = call <16 x i16> @llvm.x86.avx512.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2)
 %2 = bitcast i16 %x4 to <16 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
index 1bcac8ff553d1..268dbee385727 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-upgrade.ll
@@ -732,10 +732,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X86-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X86-NEXT: vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X86-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
@@ -743,10 +742,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X64-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X64-NEXT: vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X64-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
 %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 23, <4 x i32> %x3, i8 -1)
@@ -765,10 +763,9 @@ define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshld_d_25
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xe1,0x16]
-; X86-NEXT: vpshldd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x17]
 ; X86-NEXT: vpshldd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x71,0xd1,0x18]
+; X86-NEXT: vpshldd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xc9,0x17]
 ; X86-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_256:
@@ -776,10 +773,9 @@ define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshld_d_25
 ; X64-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshldd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x71,0xe1,0x16]
-; X64-NEXT: vpshldd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xd9,0x17]
 ; X64-NEXT: vpshldd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x71,0xd1,0x18]
+; X64-NEXT: vpshldd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x71,0xc9,0x17]
 ; X64-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
 %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshld.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 23, <8 x i32> %x3, i8 -1)
@@ -798,10 +794,9 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshld_q_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xe1,0x16]
-; X86-NEXT: vpshldq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x17]
 ; X86-NEXT: vpshldq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x71,0xd1,0x18]
+; X86-NEXT: vpshldq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xc9,0x17]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_128:
@@ -809,10 +804,9 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshld_q_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshldq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x71,0xe1,0x16]
-; X64-NEXT: vpshldq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xd9,0x17]
 ; X64-NEXT: vpshldq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x71,0xd1,0x18]
+; X64-NEXT: vpshldq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x71,0xc9,0x17]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
 %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshld.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 23, <2 x i64> %x3, i8 -1)
@@ -831,10 +825,9 @@ define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshld_q_25
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xe1,0x16]
-; X86-NEXT: vpshldq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x17]
 ; X86-NEXT: vpshldq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x71,0xd1,0x18]
+; X86-NEXT: vpshldq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xc9,0x17]
 ; X86-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_q_256:
@@ -842,10 +835,9 @@ define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshld_q_25
 ; X64-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshldq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x71,0xe1,0x16]
-; X64-NEXT: vpshldq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xd9,0x17]
 ; X64-NEXT: vpshldq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x71,0xd1,0x18]
+; X64-NEXT: vpshldq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x71,0xc9,0x17]
 ; X64-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshld.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 23, <4 x i64> %x3, i8 -1)
@@ -864,10 +856,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshld_w_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshldw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xe1,0x06]
-; X86-NEXT: vpshldw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x07]
 ; X86-NEXT: vpshldw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x70,0xd1,0x08]
+; X86-NEXT: vpshldw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xc9,0x07]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_128:
@@ -875,10 +866,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshld_w_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshldw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x70,0xe1,0x06]
-; X64-NEXT: vpshldw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xd9,0x07]
 ; X64-NEXT: vpshldw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x70,0xd1,0x08]
+; X64-NEXT: vpshldw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x70,0xc9,0x07]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 6, <8 x i16> %x3, i8 %x4)
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshld.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 7, <8 x i16> %x3, i8 -1)
@@ -896,10 +886,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshld_w
 ; X86-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vpshldw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xe1,0x06]
-; X86-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x07]
 ; X86-NEXT: vpshldw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x70,0xd1,0x08]
+; X86-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xc9,0x07]
 ; X86-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_w_256:
@@ -907,10 +896,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshld_w
 ; X64-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshldw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x70,0xe1,0x06]
-; X64-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xd9,0x07]
 ; X64-NEXT: vpshldw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x70,0xd1,0x08]
+; X64-NEXT: vpshldw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x70,0xc9,0x07]
 ; X64-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 6, <16 x i16> %x3, i16 %x4)
 %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshld.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 7, <16 x i16> %x3, i16 -1)
@@ -929,10 +917,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X86-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X86-NEXT: vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X86-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
@@ -940,10 +927,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X64-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X64-NEXT: vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X64-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22, <4 x i32> %x3, i8 %x4)
 %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 23, <4 x i32> %x3, i8 -1)
@@ -962,10 +948,9 @@ define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshrd_d_25
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshrdd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xe1,0x16]
-; X86-NEXT: vpshrdd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x17]
 ; X86-NEXT: vpshrdd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x73,0xd1,0x18]
+; X86-NEXT: vpshrdd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xc9,0x17]
 ; X86-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_256:
@@ -973,10 +958,9 @@ define { <8 x i32>, <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpshrd_d_25
 ; X64-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshrdd $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x73,0xe1,0x16]
-; X64-NEXT: vpshrdd $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xd9,0x17]
 ; X64-NEXT: vpshrdd $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x73,0xd1,0x18]
+; X64-NEXT: vpshrdd $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0x7d,0x28,0x73,0xc9,0x17]
 ; X64-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4)
 %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpshrd.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 23, <8 x i32> %x3, i8 -1)
@@ -995,10 +979,9 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshrd_q_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshrdq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xe1,0x16]
-; X86-NEXT: vpshrdq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x17]
 ; X86-NEXT: vpshrdq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x73,0xd1,0x18]
+; X86-NEXT: vpshrdq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xc9,0x17]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_128:
@@ -1006,10 +989,9 @@ define { <2 x i64>, <2 x i64>, <2 x i64> } @test_int_x86_avx512_mask_vpshrd_q_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshrdq $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x73,0xe1,0x16]
-; X64-NEXT: vpshrdq $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xd9,0x17]
 ; X64-NEXT: vpshrdq $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x73,0xd1,0x18]
+; X64-NEXT: vpshrdq $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x73,0xc9,0x17]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 22, <2 x i64> %x3, i8 %x4)
 %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpshrd.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 23, <2 x i64> %x3, i8 -1)
@@ -1028,10 +1010,9 @@ define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshrd_q_25
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshrdq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xe1,0x16]
-; X86-NEXT: vpshrdq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x17]
 ; X86-NEXT: vpshrdq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x73,0xd1,0x18]
+; X86-NEXT: vpshrdq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xc9,0x17]
 ; X86-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_q_256:
@@ -1039,10 +1020,9 @@ define { <4 x i64>, <4 x i64>, <4 x i64> } @test_int_x86_avx512_mask_vpshrd_q_25
 ; X64-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshrdq $22, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x73,0xe1,0x16]
-; X64-NEXT: vpshrdq $23, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xd9,0x17]
 ; X64-NEXT: vpshrdq $24, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x73,0xd1,0x18]
+; X64-NEXT: vpshrdq $23, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x73,0xc9,0x17]
 ; X64-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpshrd.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 23, <4 x i64> %x3, i8 -1)
@@ -1061,10 +1041,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshrd_w_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshrdw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xe1,0x06]
-; X86-NEXT: vpshrdw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x07]
 ; X86-NEXT: vpshrdw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x72,0xd1,0x08]
+; X86-NEXT: vpshrdw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xc9,0x07]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_128:
@@ -1072,10 +1051,9 @@ define { <8 x i16>, <8 x i16>, <8 x i16> } @test_int_x86_avx512_mask_vpshrd_w_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshrdw $6, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x72,0xe1,0x06]
-; X64-NEXT: vpshrdw $7, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xd9,0x07]
 ; X64-NEXT: vpshrdw $8, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x72,0xd1,0x08]
+; X64-NEXT: vpshrdw $7, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0xfd,0x08,0x72,0xc9,0x07]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 6, <8 x i16> %x3, i8 %x4)
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpshrd.w.128(<8 x i16> %x0, <8 x i16> %x1, i32 7, <8 x i16> %x3, i8 -1)
@@ -1093,10 +1071,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshrd_w
 ; X86-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vpshrdw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xe1,0x06]
-; X86-NEXT: vpshrdw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x07]
 ; X86-NEXT: vpshrdw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x72,0xd1,0x08]
+; X86-NEXT: vpshrdw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xc9,0x07]
 ; X86-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X86-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_w_256:
@@ -1104,10 +1081,9 @@ define { <16 x i16>, <16 x i16>, <16 x i16> } @test_int_x86_avx512_mask_vpshrd_w
 ; X64-NEXT: vmovdqa %ymm2, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshrdw $6, %ymm1, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x72,0xe1,0x06]
-; X64-NEXT: vpshrdw $7, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xd9,0x07]
 ; X64-NEXT: vpshrdw $8, %ymm1, %ymm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x72,0xd1,0x08]
+; X64-NEXT: vpshrdw $7, %ymm1, %ymm0, %ymm1 # encoding: [0x62,0xf3,0xfd,0x28,0x72,0xc9,0x07]
 ; X64-NEXT: vmovdqa %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc4]
-; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %res0 = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 6, <16 x i16> %x3, i16 %x4)
 %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpshrd.w.256(<16 x i16> %x0, <16 x i16> %x1, i32 7, <16 x i16> %x3, i16 -1)
@@ -1126,10 +1102,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X86-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X86-NEXT: vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X86-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128_2:
@@ -1137,10 +1112,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X64-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X64-NEXT: vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X64-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %1 = call <4 x i32> @llvm.x86.avx512.vpshld.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
 %2 = bitcast i8 %x4 to <8 x i1>
@@ -1301,10 +1275,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X86-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X86-NEXT: vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X86-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128_2:
@@ -1312,10 +1285,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X64-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X64-NEXT: vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X64-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %1 = call <4 x i32> @llvm.x86.avx512.vpshrd.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 22)
 %2 = bitcast i8 %x4 to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
index 9223885730d04..abcb7d3f1182a 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
@@ -728,10 +728,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X86-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X86-NEXT: vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X86-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshld_d_128:
@@ -739,10 +738,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshld_d_12
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshldd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x71,0xe1,0x16]
-; X64-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xd9,0x17]
 ; X64-NEXT: vpshldd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x71,0xd1,0x18]
+; X64-NEXT: vpshldd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x71,0xc9,0x17]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %1 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> )
 %2 = bitcast i8 %x4 to <8 x i1>
@@ -897,10 +895,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_1
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
 ; X86-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X86-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X86-NEXT: vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X86-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X86-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X86-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpshrd_d_128:
@@ -908,10 +905,9 @@ define { <4 x i32>, <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpshrd_d_1
 ; X64-NEXT: vmovdqa %xmm2, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe2]
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpshrdd $22, %xmm1, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x73,0xe1,0x16]
-; X64-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xd9,0x17]
 ; X64-NEXT: vpshrdd $24, %xmm1, %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x73,0xd1,0x18]
+; X64-NEXT: vpshrdd $23, %xmm1, %xmm0, %xmm1 # encoding: [0x62,0xf3,0x7d,0x08,0x73,0xc9,0x17]
 ; X64-NEXT: vmovdqa %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc4]
-; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
 %1 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> )
 %2 = bitcast i8 %x4 to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index c0bb0037923dc..21c26f3cd78ba 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -10720,9 +10720,9 @@ declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8)
 define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestm_d_128:
 ; X86: # %bb.0:
-; X86-NEXT: vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
 ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10779,9 +10779,9 @@ declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8)
 define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestm_q_128:
 ; X86: # %bb.0:
-; X86-NEXT: vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
 ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10810,9 +10810,9 @@ declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8)
 define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestm_q_256:
 ; X86: # %bb.0:
-; X86-NEXT: vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
 ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10843,9 +10843,9 @@ declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2)
 define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestnm_d_128:
 ; X86: # %bb.0:
-; X86-NEXT: vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
 ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10902,9 +10902,9 @@ declare i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2)
 define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestnm_q_128:
 ; X86: # %bb.0:
-; X86-NEXT: vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
 ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
@@ -10933,9 +10933,9 @@ declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2)
 define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; X86-LABEL: test_int_x86_avx512_ptestnm_q_256:
 ; X86: # %bb.0:
-; X86-NEXT: vptestnmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vptestnmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
 ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index 1d3b015f3c547..386b63d551988 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -178,13 +178,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: pushl %edi
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: subl $156, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: movl %esi, %eax
 ; X86-NEXT: sarl $31, %eax
 ; X86-NEXT: xorl %eax, %esi
 ; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT: xorl %eax, %edx
 ; X86-NEXT: movl %edx, %esi
 ; X86-NEXT: movl %ecx, %edx
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 58ea70e58028f..8a1d247543cf9 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -208,11 +208,11 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT: xorl $31, %edx
 ; X86-NEXT: bsrl %ebp, %ebp
 ; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: xorl $31, %ebp
 ; X86-NEXT: orl $32, %ebp
 ; X86-NEXT: testl %eax, %eax
 ; X86-NEXT: cmovnel %edx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: orl $64, %ebp
 ; X86-NEXT: movl %edi, %edx
 ; X86-NEXT: orl %ebx, %edx
diff --git a/llvm/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll b/llvm/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
index dfbd7278ce432..fbd2a6752dfd5 100644
--- a/llvm/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
@@ -96,9 +96,8 @@ define void @test_memcpy_args(ptr %Storage) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movq (%rdi), %rax
 ; CHECK-NEXT: movq 8(%rdi), %rsi
-; CHECK-NEXT: movq %rax, %rdi
+; CHECK-NEXT: movq (%rdi), %rdi
 ; CHECK-NEXT: movl $1024, %edx # imm = 0x400
 ; CHECK-NEXT: callq __llvm_memcpy_element_unordered_atomic_4@PLT
 ; CHECK-NEXT: popq %rax
@@ -210,9 +209,8 @@ define void @test_memmove_args(ptr %Storage) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movq (%rdi), %rax
 ; CHECK-NEXT: movq 8(%rdi), %rsi
-; CHECK-NEXT: movq %rax, %rdi
+; CHECK-NEXT: movq (%rdi), %rdi
 ; CHECK-NEXT: movl $1024, %edx # imm = 0x400
 ; CHECK-NEXT: callq __llvm_memmove_element_unordered_atomic_4@PLT
 ; CHECK-NEXT: popq %rax
diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
index 0a52dfff71eda..c483528ab327f 100644
--- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll
@@ -154,9 +154,8 @@ define <4 x i64> @vsext_v4i64_v4i1(<4 x i1> %va, <4 x i1> %m, i32 zeroext %evl)
 ; SSE: # %bb.0:
 ; SSE-NEXT: pslld $31, %xmm0
 ; SSE-NEXT: psrad $31, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: vsext_v4i64_v4i1:
diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll
index 90e075bfabf0a..c10a1ae9b8fb7 100644
--- a/llvm/test/CodeGen/X86/extract-bits.ll
+++ b/llvm/test/CodeGen/X86/extract-bits.ll
@@ -34,12 +34,11 @@ define i32 @bextr32_a0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_a0:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT: shrl %cl, %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl $1, %eax
-; X86-NOBMI-NEXT: movl %edx, %ecx
 ; X86-NOBMI-NEXT: shll %cl, %eax
 ; X86-NOBMI-NEXT: decl %eax
 ; X86-NOBMI-NEXT: andl %esi, %eax
@@ -99,12 +98,11 @@ define i32 @bextr32_a0_arithmetic(i32 %val, i32 %numskipbits, i32 %numlowbits) n
 ; X86-NOBMI-LABEL: bextr32_a0_arithmetic:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT: sarl %cl, %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl $1, %eax
-; X86-NOBMI-NEXT: movl %edx, %ecx
 ; X86-NOBMI-NEXT: shll %cl, %eax
 ; X86-NOBMI-NEXT: decl %eax
 ; X86-NOBMI-NEXT: andl %esi, %eax
@@ -166,12 +164,11 @@ define i32 @bextr32_a1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-LABEL: bextr32_a1_indexzext:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT: shrl %cl, %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl $1, %eax
-; X86-NOBMI-NEXT: movl %edx, %ecx
 ; X86-NOBMI-NEXT: shll %cl, %eax
 ; X86-NOBMI-NEXT: decl %eax
 ; X86-NOBMI-NEXT: andl %esi, %eax
@@ -232,10 +229,10 @@ define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-NOBMI-LABEL: bextr32_a2_load:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: movl (%eax), %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: shrl %cl, %esi
 ; X86-NOBMI-NEXT: movl $1, %eax
 ; X86-NOBMI-NEXT: movl %edx, %ecx
@@ -302,10 +299,10 @@ define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-NOBMI-LABEL: bextr32_a3_load_indexzext:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: movl (%eax), %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: shrl %cl, %esi
 ; X86-NOBMI-NEXT: movl $1, %eax
 ; X86-NOBMI-NEXT: movl %edx, %ecx
@@ -373,12 +370,11 @@ define i32 @bextr32_a4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X86-NOBMI-LABEL: bextr32_a4_commutative:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT: shrl %cl, %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl $1, %eax
-; X86-NOBMI-NEXT: movl %edx, %ecx
 ; X86-NOBMI-NEXT: shll %cl, %eax
 ; X86-NOBMI-NEXT: decl %eax
 ; X86-NOBMI-NEXT: andl %esi, %eax
@@ -440,10 +436,10 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-NOBMI-NEXT: pushl %edi
 ; X86-NOBMI-NEXT: pushl %esi
 ; X86-NOBMI-NEXT: pushl %eax
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: movl %eax, %ecx
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: shrl %cl, %edi
 ; X86-NOBMI-NEXT: movl $1, %esi
 ; X86-NOBMI-NEXT: movl %edx, %ecx
@@ -2216,12 +2212,11 @@ define i32 @bextr32_b0(i32 %val, i32 %numskipbits, i32 %numlowbits) nounwind {
 ; X86-NOBMI-LABEL: bextr32_b0:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT: shrl %cl, %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl $-1, %eax
-; X86-NOBMI-NEXT: movl %edx, %ecx
 ; X86-NOBMI-NEXT: shll %cl, %eax
 ; X86-NOBMI-NEXT: notl %eax
 ; X86-NOBMI-NEXT: andl %esi, %eax
@@ -2281,12 +2276,11 @@ define i32 @bextr32_b1_indexzext(i32 %val, i8 zeroext %numskipbits, i8 zeroext %
 ; X86-NOBMI-LABEL: bextr32_b1_indexzext:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT: shrl %cl, %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl $-1, %eax
-; X86-NOBMI-NEXT: movl %edx, %ecx
 ; X86-NOBMI-NEXT: shll %cl, %eax
 ; X86-NOBMI-NEXT: notl %eax
 ; X86-NOBMI-NEXT: andl %esi, %eax
@@ -2347,10 +2341,10 @@ define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind
 ; X86-NOBMI-LABEL: bextr32_b2_load:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: movl (%eax), %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: shrl %cl, %esi
 ; X86-NOBMI-NEXT: movl $-1, %eax
 ; X86-NOBMI-NEXT: movl %edx, %ecx
@@ -2417,10 +2411,10 @@ define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex
 ; X86-NOBMI-LABEL: bextr32_b3_load_indexzext:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: movl (%eax), %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: shrl %cl, %esi
 ; X86-NOBMI-NEXT: movl $-1, %eax
 ; X86-NOBMI-NEXT: movl %edx, %ecx
@@ -2488,12 +2482,11 @@ define i32 @bextr32_b4_commutative(i32 %val, i32 %numskipbits, i32 %numlowbits)
 ; X86-NOBMI-LABEL: bextr32_b4_commutative:
 ; X86-NOBMI: # %bb.0:
 ; X86-NOBMI-NEXT: pushl %esi
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NOBMI-NEXT: shrl %cl, %esi
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NOBMI-NEXT: movl $-1, %eax
-; X86-NOBMI-NEXT: movl %edx, %ecx
 ; X86-NOBMI-NEXT: shll %cl, %eax
 ; X86-NOBMI-NEXT: notl %eax
 ; X86-NOBMI-NEXT: andl %esi, %eax
@@ -2555,10 +2548,10 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits
 ; X86-NOBMI-NEXT: pushl %edi
 ; X86-NOBMI-NEXT: pushl %esi
 ; X86-NOBMI-NEXT: pushl %eax
-; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X86-NOBMI-NEXT: movl %eax, %ecx
+; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx
 ; X86-NOBMI-NEXT: shrl %cl, %edi
 ; X86-NOBMI-NEXT: movl $-1, %esi
 ; X86-NOBMI-NEXT: movl %edx, %ecx
diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
index 62466bfa98ec2..128e3d2890963 100644
--- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
+++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll
@@ -584,11 +584,10 @@ define <4 x i64> @eq_or_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; SSE41-NEXT: pcmpeqq %xmm4, %xmm1
 ; SSE41-NEXT: por %xmm2, %xmm1
 ; SSE41-NEXT: packssdw %xmm1, %xmm0
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm0
 ; SSE41-NEXT: pslld $31, %xmm1
 ; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; SSE2-LABEL: eq_or_to_abs_vec4x64_sext:
@@ -724,11 +723,10 @@ define <4 x i64> @ne_and_to_abs_vec4x64_sext(<4 x i64> %x) {
 ; SSE41-NEXT: por %xmm2, %xmm1
 ; SSE41-NEXT: packssdw %xmm1, %xmm0
 ; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
+; SSE41-NEXT: pmovsxdq %xmm0, %xmm0
 ; SSE41-NEXT: pslld $31, %xmm1
 ; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
 ; SSE41-NEXT: retq
 ;
 ; SSE2-LABEL: ne_and_to_abs_vec4x64_sext:
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index cc4d4c4543a51..b909a6b8139e3 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -1469,13 +1469,12 @@ define <2 x i1> @isnan_v2f(<2 x float> %x) {
 ; X86-NEXT: fnstsw %ax
 ; X86-NEXT: # kill: def $ah killed $ah killed $ax
 ; X86-NEXT: sahf
-; X86-NEXT: setp %cl
 ; X86-NEXT: fucomp %st(0)
 ; X86-NEXT: fnstsw %ax
 ; X86-NEXT: # kill: def $ah killed $ah killed $ax
+; X86-NEXT: setp %al
 ; X86-NEXT: sahf
 ; X86-NEXT: setp %dl
-; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: isnan_v2f:
@@ -1498,13 +1497,12 @@ define <2 x i1> @isnot_nan_v2f(<2 x float> %x) {
 ; X86-NEXT: fnstsw %ax
 ; X86-NEXT: # kill: def $ah killed $ah killed $ax
 ; X86-NEXT: sahf
-; X86-NEXT: setnp %cl
 ; X86-NEXT: fucomp %st(0)
 ; X86-NEXT: fnstsw %ax
 ; X86-NEXT: # kill: def $ah killed $ah killed $ax
+; X86-NEXT: setnp %al
 ; X86-NEXT: sahf
 ; X86-NEXT: setnp %dl
-; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: isnot_nan_v2f:
diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll
index 0aa376195627d..b24fa93ef4ecb 100644
--- a/llvm/test/CodeGen/X86/ldexp.ll
+++ b/llvm/test/CodeGen/X86/ldexp.ll
@@ -353,10 +353,9 @@ define <2 x double> @ldexp_v2f64(<2 x double> %val, <2 x i32> %exp) {
 ; WIN64-NEXT: .seh_savexmm %xmm6, 32
 ; WIN64-NEXT: .seh_endprologue
 ; WIN64-NEXT: movaps (%rcx), %xmm6
-; WIN64-NEXT: movl (%rdx), %eax
 ; WIN64-NEXT: movl 4(%rdx), %esi
+; WIN64-NEXT: movl (%rdx), %edx
 ; WIN64-NEXT: movaps %xmm6, %xmm0
-; WIN64-NEXT: movl %eax, %edx
 ; WIN64-NEXT: callq ldexp
 ; WIN64-NEXT: movaps %xmm0, %xmm7
 ; WIN64-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1]
diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
index 5e168a82e03e7..ed09823b2b515 100644
--- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll
+++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll
@@ -77,13 +77,13 @@ define <2 x i256> @test_srl(<2 x i256> %In) nounwind {
 ; X86-NEXT: subl $8, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: movl %ebp, %esi
 ; X86-NEXT: shldl $28, %edx, %esi
 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: shldl $28, %ebx, %edx
 ; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT: shldl $28, %ecx, %ebx
@@ -159,13 +159,13 @@ define <2 x i256> @test_sra(<2 x i256> %In) nounwind {
 ; X86-NEXT: subl $8, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: movl %ebp, %esi
 ; X86-NEXT: shldl $26, %edx, %esi
 ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: shldl $26, %ebx, %edx
 ; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X86-NEXT: shldl $26, %ecx, %ebx
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index ef85cd146d65f..1f4aa669a67e5 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -1263,15 +1263,14 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun
 ; SSE-NEXT: movapd %xmm4, %xmm14
 ; SSE-NEXT: mulpd %xmm13, %xmm14
 ; SSE-NEXT: addpd %xmm10, %xmm14
-; SSE-NEXT: movapd %xmm6, %xmm4
 ; SSE-NEXT: mulpd %xmm6, %xmm13
 ; SSE-NEXT: addpd %xmm15, %xmm13
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1]
+; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; SSE-NEXT: mulpd %xmm6, %xmm8
 ; SSE-NEXT: movapd %xmm7, %xmm10
 ; SSE-NEXT: mulpd %xmm8, %xmm10
 ; SSE-NEXT: addpd %xmm13, %xmm10
-; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: mulpd %xmm6, %xmm8
 ; SSE-NEXT: addpd %xmm14, %xmm8
 ; SSE-NEXT: movapd %xmm12, %xmm13
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm12[0]
@@ -1289,8 +1288,8 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun
 ; SSE-NEXT: movapd %xmm5, %xmm14
 ; SSE-NEXT: mulpd %xmm13, %xmm14
 ; SSE-NEXT: addpd %xmm12, %xmm14
-; SSE-NEXT: mulpd %xmm4, %xmm13
-; SSE-NEXT: movapd %xmm4, %xmm2
+; SSE-NEXT: mulpd %xmm6, %xmm13
+; SSE-NEXT: movapd %xmm6, %xmm2
 ; SSE-NEXT: addpd %xmm15, %xmm13
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1,1]
 ; SSE-NEXT: movapd %xmm7, %xmm12
@@ -1644,7 +1643,6 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movq %rdi, %rax
 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13
 ;
SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm14 ; SSE-NEXT: movaps %xmm14, %xmm15 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,0],xmm14[0,0] @@ -1655,17 +1653,18 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] ; SSE-NEXT: movaps %xmm3, %xmm10 ; SSE-NEXT: movaps %xmm3, %xmm12 +; SSE-NEXT: addps %xmm15, %xmm0 ; SSE-NEXT: mulps %xmm0, %xmm10 ; SSE-NEXT: addps %xmm5, %xmm10 ; SSE-NEXT: mulps %xmm2, %xmm0 -; SSE-NEXT: addps %xmm15, %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[2,2] ; SSE-NEXT: movaps %xmm4, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: movaps %xmm4, %xmm15 ; SSE-NEXT: mulps %xmm1, %xmm2 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulps %xmm11, %xmm1 ; SSE-NEXT: addps %xmm10, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3,3,3] @@ -1914,8 +1913,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: mulps %xmm0, %xmm7 ; SSE-NEXT: addps %xmm1, %xmm7 -; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: mulps %xmm6, %xmm0 +; SSE-NEXT: movaps %xmm6, %xmm3 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm1 @@ -1956,7 +1956,6 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1] -; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: mulps %xmm14, %xmm15 @@ -2750,7 +2749,6 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vmulps %ymm6, %ymm8, %ymm6 ; AVX512F-NEXT: vaddps %ymm6, %ymm4, %ymm6 ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload -; AVX512F-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload ; AVX512F-NEXT: vbroadcastss %xmm7, %ymm12 ; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm7[1,1,3,3] @@ -2799,8 +2797,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vmulps %ymm1, %ymm10, %ymm1 ; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm1 -; AVX512F-NEXT: vbroadcastss %xmm1, %ymm1 ; AVX512F-NEXT: vmulps %ymm1, %ymm2, %ymm1 +; AVX512F-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload +; AVX512F-NEXT: vbroadcastss %xmm1, %ymm1 ; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vmovshdup {{.*#+}} ymm1 = ymm13[1,1,3,3,5,5,7,7] ; AVX512F-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -3294,7 +3293,8 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm14 +; SSE-NEXT: movapd %xmm1, %xmm9 +; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm11 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: movapd %xmm13, %xmm12 @@ -3303,7 +3303,6 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) 
noun ; SSE-NEXT: mulpd %xmm12, %xmm10 ; SSE-NEXT: movapd %xmm2, %xmm8 ; SSE-NEXT: mulpd %xmm12, %xmm8 -; SSE-NEXT: movapd %xmm1, %xmm9 ; SSE-NEXT: mulpd %xmm12, %xmm9 ; SSE-NEXT: mulpd %xmm0, %xmm12 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1,1] @@ -3321,7 +3320,6 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm12, %xmm13 ; SSE-NEXT: movapd %xmm11, %xmm6 ; SSE-NEXT: unpcklpd {{.*#+}} xmm6 = xmm6[0],xmm11[0] -; SSE-NEXT: movapd %xmm14, %xmm1 ; SSE-NEXT: mulpd %xmm6, %xmm1 ; SSE-NEXT: addpd %xmm13, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 @@ -3530,16 +3528,16 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm15, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 ; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm13, %xmm8 +; SSE-NEXT: movapd %xmm10, %xmm5 ; SSE-NEXT: movapd %xmm13, %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm5 +; SSE-NEXT: addpd %xmm5, %xmm4 +; SSE-NEXT: movapd %xmm13, %xmm8 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm10, %xmm5 ; SSE-NEXT: movapd %xmm10, %xmm15 -; SSE-NEXT: mulpd %xmm1, %xmm5 ; SSE-NEXT: addpd %xmm3, %xmm5 ; SSE-NEXT: movapd %xmm12, %xmm10 ; SSE-NEXT: mulpd %xmm12, %xmm0 -; SSE-NEXT: movapd %xmm14, %xmm9 ; SSE-NEXT: mulpd %xmm14, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 @@ -3589,24 +3587,24 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm6, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 -; SSE-NEXT: addpd %xmm5, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm1, %xmm5 ; SSE-NEXT: addpd %xmm7, %xmm5 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm2, %xmm1 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] +; SSE-NEXT: movapd %xmm14, %xmm9 ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movapd %xmm7, %xmm3 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm3, %xmm2 +; SSE-NEXT: mulpd %xmm7, %xmm2 ; SSE-NEXT: addpd %xmm1, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm3, %xmm1 +; SSE-NEXT: mulpd %xmm7, %xmm1 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm3, %xmm5 +; SSE-NEXT: mulpd %xmm7, %xmm5 ; SSE-NEXT: addpd %xmm4, %xmm5 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm4, %xmm3 @@ -3751,7 +3749,6 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 ; SSE-NEXT: mulpd %xmm0, %xmm11 -; SSE-NEXT: movapd %xmm13, %xmm6 ; SSE-NEXT: movapd %xmm13, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 ; SSE-NEXT: addpd %xmm11, %xmm4 @@ -3795,6 +3792,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: mulpd %xmm9, %xmm3 +; SSE-NEXT: movapd %xmm13, %xmm6 ; SSE-NEXT: addpd %xmm0, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: mulpd %xmm9, %xmm10 @@ -3855,16 +3853,16 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm12, %xmm2 ; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movapd %xmm14, %xmm3 -; SSE-NEXT: 
mulpd %xmm0, %xmm3 ; SSE-NEXT: movapd %xmm6, %xmm7 -; SSE-NEXT: mulpd %xmm1, %xmm7 -; SSE-NEXT: addpd %xmm3, %xmm7 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movapd %xmm14, %xmm3 +; SSE-NEXT: mulpd %xmm1, %xmm7 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulpd %xmm0, %xmm3 ; SSE-NEXT: movapd %xmm4, %xmm3 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: addpd %xmm4, %xmm7 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movapd %xmm6, %xmm9 ; SSE-NEXT: mulpd %xmm1, %xmm9 ; SSE-NEXT: addpd %xmm3, %xmm9 diff --git a/llvm/test/CodeGen/X86/mul-i1024.ll b/llvm/test/CodeGen/X86/mul-i1024.ll index 0299773aa67ad..4f4c46c32868b 100644 --- a/llvm/test/CodeGen/X86/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/mul-i1024.ll @@ -5292,6 +5292,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: mulq %r14 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; X64-NEXT: movq %r11, %r13 ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; X64-NEXT: movq %r12, %rax ; X64-NEXT: mulq %r14 @@ -5300,7 +5301,6 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %r8, %r14 ; X64-NEXT: adcq $0, %r10 ; X64-NEXT: movq %r9, %rax -; X64-NEXT: movq %r11, %r13 ; X64-NEXT: mulq %r11 ; X64-NEXT: movq %rdx, %r8 ; X64-NEXT: addq %r14, %rax diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll index 2d7737bfdd3c2..65bd1df582104 100644 --- a/llvm/test/CodeGen/X86/mul-i256.ll +++ b/llvm/test/CodeGen/X86/mul-i256.ll @@ -304,13 +304,13 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq (%rdi), %rbx ; X64-NEXT: movq 8(%rdi), %r11 ; X64-NEXT: movq 16(%rdi), %r10 -; X64-NEXT: movq 16(%rsi), %r8 ; X64-NEXT: movq (%rsi), %r9 ; X64-NEXT: movq 8(%rsi), %r14 ; X64-NEXT: movq 24(%rdi), %r15 ; X64-NEXT: imulq %r9, %r15 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r10 +; X64-NEXT: movq 16(%rsi), %r8 ; X64-NEXT: movq %rax, %rdi ; X64-NEXT: imulq %r14, %r10 ; X64-NEXT: addq %rdx, %r10 diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll index 64f6746e616ed..355f770fd1704 100644 --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -1218,11 +1218,11 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq %rsi, %rdx ; X64-NEXT: movq %rdx, %r12 -; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r8 diff --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll index f3741dc202dc5..d7b8734e96e39 100644 --- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll +++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll @@ -277,12 +277,12 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64 ; CHECK32-NEXT: pushl %esi ; CHECK32-NEXT: pushl %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebp -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK32-NEXT: movl 
{{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK32-NEXT: lock cmpxchg8b (%esi) ; CHECK32-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 2b475644a38cf..f390c919f46d2 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -1307,10 +1307,9 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero -; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm2, %ymm1 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: mul_v8i64_sext: diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index c2a009f06b89d..b60b36744f038 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -1111,10 +1111,10 @@ define <32 x i32> @zext_mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { ; AVX512F-LABEL: zext_mulhuw_v32i16_lshr: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq @@ -1206,10 +1206,10 @@ define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { ; AVX512F-LABEL: mulhsw_v32i16_lshr: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq @@ 
-1307,10 +1307,10 @@ define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) { ; AVX512F-LABEL: mulhsw_v32i16_ashr: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm1 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: retq @@ -1476,7 +1476,6 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; AVX512F-LABEL: zext_mulhuw_v64i16_lshr: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm4 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 @@ -1485,6 +1484,7 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vpmulhuw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 @@ -1657,7 +1657,6 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; AVX512F-LABEL: mulhsw_v64i16_lshr: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm4 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 @@ -1666,6 +1665,7 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 @@ -1839,7 +1839,6 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; AVX512F-LABEL: mulhsw_v64i16_ashr: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm4 -; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmulhw %ymm2, %ymm0, %ymm0 @@ -1848,6 +1847,7 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4 ; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm3 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 diff --git a/llvm/test/CodeGen/X86/pointer-vector.ll b/llvm/test/CodeGen/X86/pointer-vector.ll index aa9b977482fc1..0e8c596783e57 100644 --- a/llvm/test/CodeGen/X86/pointer-vector.ll +++ b/llvm/test/CodeGen/X86/pointer-vector.ll @@ -5,9 +5,8 @@ define <8 x ptr> @SHUFF0(<4 x ptr> %ptrv) nounwind { ; CHECK-LABEL: SHUFF0: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,1,2] ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,1] -; CHECK-NEXT: movdqa %xmm2, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,1,2] ; CHECK-NEXT: retl entry: %G = shufflevector <4 x ptr> %ptrv, <4 x ptr> %ptrv, <8 x i32> diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll index b0aa566a8235f..bec10ac80f1cc 100644 --- a/llvm/test/CodeGen/X86/pr11334.ll +++ b/llvm/test/CodeGen/X86/pr11334.ll @@ -63,11 +63,10 @@ define <8 x double> @v8f2d_ext_vec(<8 x float> %v1) nounwind { ; SSE-NEXT: cvtps2pd %xmm0, %xmm5 ; SSE-NEXT: cvtps2pd %xmm1, %xmm2 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvtps2pd %xmm0, %xmm4 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; SSE-NEXT: cvtps2pd %xmm1, %xmm3 +; SSE-NEXT: cvtps2pd %xmm0, %xmm1 ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: v8f2d_ext_vec: diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll index 29922c2ac1a71..ea72b64ee00c0 100644 --- a/llvm/test/CodeGen/X86/pr34177.ll +++ b/llvm/test/CodeGen/X86/pr34177.ll @@ -49,13 +49,13 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr { ; AVX512VL-LABEL: test: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 -; AVX512VL-NEXT: kshiftrb $2, %k0, %k1 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: testb $2, %al ; AVX512VL-NEXT: fld1 ; AVX512VL-NEXT: fldz ; AVX512VL-NEXT: fld %st(0) ; AVX512VL-NEXT: fcmovne %st(2), %st +; AVX512VL-NEXT: kshiftrb $2, %k0, %k1 ; AVX512VL-NEXT: testb $1, %al ; AVX512VL-NEXT: fld %st(1) ; AVX512VL-NEXT: fcmovne %st(3), %st diff --git a/llvm/test/CodeGen/X86/pr61964.ll b/llvm/test/CodeGen/X86/pr61964.ll index 1949841ea216b..4fea9c8cffec6 100644 --- a/llvm/test/CodeGen/X86/pr61964.ll +++ b/llvm/test/CodeGen/X86/pr61964.ll @@ -18,11 +18,11 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = 
xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq @@ -31,10 +31,9 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] -; AVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; AVX2-NEXT: vpermd %ymm0, %ymm3, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: vpermd %ymm2, %ymm3, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: splitTransposeDecode_8_avx2: @@ -55,11 +54,11 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 ; XOPAVX1-NEXT: vmovaps %ymm2, %ymm0 ; XOPAVX1-NEXT: retq @@ -68,10 +67,9 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; XOPAVX2-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7] -; XOPAVX2-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; XOPAVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm3, %ymm1 -; XOPAVX2-NEXT: vmovdqa %ymm2, %ymm0 +; XOPAVX2-NEXT: vpermd %ymm2, %ymm3, %ymm0 ; XOPAVX2-NEXT: retq %shuffle.i = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> %shuffle.i59 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 
4fbe05cd1b2f2..779999816ebbf 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -406,7 +406,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $92, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -429,6 +428,7 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp ; i686-NEXT: sarl $31, %eax ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -550,16 +550,16 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: subl $100, %esp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx ; i686-NEXT: movl {{[0-9]+}}(%esp), %edx +; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx ; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx ; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/sibcall.ll b/llvm/test/CodeGen/X86/sibcall.ll index 4a0a68ee32243..d5baa7459ff80 100644 --- a/llvm/test/CodeGen/X86/sibcall.ll +++ b/llvm/test/CodeGen/X86/sibcall.ll @@ -714,9 +714,8 @@ define fastcc void @t21_sret_to_sret_more_args2(ptr noalias sret(%struct.foo) %a ; X86-NEXT: pushl %esi ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %edx, (%esp) -; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: calll f_sret@PLT ; X86-NEXT: movl %esi, %eax ; X86-NEXT: addl $8, %esp diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index ce56283df6010..0771beecda770 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -167,10 +167,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: shldl $30, %eax, %esi ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll index 85c966c447fad..14db7ac90ef57 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -266,10 +266,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
X86-NEXT: shrdl $2, %edx, %ecx ; X86-NEXT: cmpl $2, %edx ; X86-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF @@ -277,7 +277,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: cmpl $-2, %edx ; X86-NEXT: movl $-2147483648, %esi # imm = 0x80000000 ; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: movl %edi, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %edi ; X86-NEXT: shrdl $2, %edx, %edi diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index b2b5bcc5b44b2..8a5d3bb093677 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -89,12 +89,12 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebx @@ -583,12 +583,12 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ebx +; X86-NEXT: movl %edi, %eax ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %ebx @@ -1293,9 +1293,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: mull {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: movl %esi, %ecx ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index f1fd05565c47e..554548fa8f4c3 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -1659,17 +1659,19 @@ define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 13(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 12(%rdi), %r15d ; SCALAR-NEXT: movzbl 11(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 6(%rdi), %r10d ; SCALAR-NEXT: movzbl 10(%rdi), %ebp +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movzbl 9(%rdi), %r14d ; SCALAR-NEXT: movzbl 8(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 7(%rdi), %r12d -; SCALAR-NEXT: movzbl 6(%rdi), %r10d ; SCALAR-NEXT: movzbl 5(%rdi), %r9d ; SCALAR-NEXT: movzbl 4(%rdi), %ebx +; SCALAR-NEXT: movzbl 12(%rdi), %r15d ; SCALAR-NEXT: movzbl 3(%rdi), %r8d ; SCALAR-NEXT: movzbl 2(%rdi), %ecx ; SCALAR-NEXT: movzbl (%rdi), %eax @@ -1680,11 +1682,9 @@ define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %bl ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill @@ -4756,8 +4756,10 @@ define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movzbl 8(%rdi), %r14d ; SCALAR-NEXT: movzbl 7(%rdi), %ebx ; SCALAR-NEXT: movzbl 6(%rdi), %r10d -; SCALAR-NEXT: movzbl 5(%rdi), %r15d ; SCALAR-NEXT: movzbl 4(%rdi), %r9d +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl 5(%rdi), %r15d ; SCALAR-NEXT: movzbl 3(%rdi), %r8d ; SCALAR-NEXT: movzbl 2(%rdi), %ecx ; SCALAR-NEXT: movzbl (%rdi), %eax @@ -4769,8 +4771,6 @@ define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movl %r15d, %r9d ; SCALAR-NEXT: notb %r9b @@ -6694,8 +6694,10 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movzbl 8(%rdi), %r14d ; SCALAR-NEXT: movzbl 7(%rdi), %ebp ; SCALAR-NEXT: movzbl 6(%rdi), %r11d -; SCALAR-NEXT: movzbl 5(%rdi), %ebx ; SCALAR-NEXT: movzbl 4(%rdi), %r9d +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: notb %r9b +; SCALAR-NEXT: movzbl 5(%rdi), %ebx ; SCALAR-NEXT: movzbl 3(%rdi), %r8d ; SCALAR-NEXT: movzbl 2(%rdi), %ecx ; SCALAR-NEXT: movzbl (%rdi), %eax @@ -6707,8 +6709,6 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movl %ebx, %r9d ; SCALAR-NEXT: notb %r9b @@ -6761,12 +6761,13 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %r11b, 15(%rdx) ; SCALAR-NEXT: movb %r8b, 14(%rdx) ; SCALAR-NEXT: movb %al, 13(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SCALAR-NEXT: movb %r8b, 4(%rdx) ; SCALAR-NEXT: movb %al, 12(%rdx) +; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; SCALAR-NEXT: movb %r13b, 11(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r15b, 10(%rdx) -; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SCALAR-NEXT: movb %sil, 9(%rdx) ; SCALAR-NEXT: movb %r12b, 8(%rdx) ; SCALAR-NEXT: movb %r14b, 7(%rdx) @@ -6775,13 +6776,13 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movl %r9d, %r11d ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 4(%rdx) ; SCALAR-NEXT: movb %bpl, 3(%rdx) ; SCALAR-NEXT: movb %dil, 2(%rdx) ; SCALAR-NEXT: movb %cl, 1(%rdx) ; SCALAR-NEXT: movl %ecx, %r14d -; SCALAR-NEXT: movl %r10d, %esi ; SCALAR-NEXT: movb %r10b, (%rdx) +; SCALAR-NEXT: movb %al, 22(%rdx) +; SCALAR-NEXT: movl %r10d, %esi ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; SCALAR-NEXT: movb %cl, 31(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload @@ -6799,7 +6800,6 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload ; SCALAR-NEXT: movb %bl, 23(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload -; SCALAR-NEXT: movb %al, 22(%rdx) ; SCALAR-NEXT: movb %r11b, 21(%rdx) ; SCALAR-NEXT: movb %r8b, 20(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll index ccabb360a990c..939452c98e0e3 100644 --- a/llvm/test/CodeGen/X86/umul-with-overflow.ll +++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll @@ -88,13 +88,12 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull %edi -; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, %esi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %edi @@ -341,8 +340,8 @@ define i300 @test4(i300 %a, i300 %b) nounwind { ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: addl %edi, %ebp ; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: mull %edi ; X86-NEXT: movl %edi, %esi ; X86-NEXT: movl %edx, %ebx diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index eacc714b49a4d..f357e57b30599 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -123,10 +123,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), 
%ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: shldl $30, %eax, %esi ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll index 8c7078c726328..f40276822b3fa 100644 --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -195,15 +195,14 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrdl $2, %edx, %esi ; X86-NEXT: cmpl $4, %edx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovael %ecx, %esi -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shrdl $2, %edx, %ebx @@ -392,13 +391,12 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $-1, %edi ; X86-NEXT: cmovol %edi, %esi -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: cmovol %edi, %ebx diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 82603b35ba712..901ad0acae21b 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -48,12 +48,11 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: mull %ecx ; X86-NEXT: movl %ecx, %ebx ; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull %edi ; X86-NEXT: leal (%ecx,%eax), %esi ; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index af841cf38b24a..fe8419d51e705 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -2686,10 +2686,9 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) { ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_8i16_to_8f32: @@ -2732,10 +2731,9 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_8i8_to_8f32: @@ -2780,10 +2778,9 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: uitofp_16i8_to_8f32: @@ -5162,16 +5159,14 @@ define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind { define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, ptr %p) nounwind { ; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2: ; SSE: # %bb.0: -; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 ; SSE-NEXT: movss %xmm0, (%rdi) -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2: ; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm1 ; AVX-NEXT: vmovss %xmm0, (%rdi) -; AVX-NEXT: vmovaps %xmm1, %xmm0 +; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 0 %r = sitofp i32 %e to float diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll index 460c5fe11f82a..18ce750492794 100644 --- a/llvm/test/CodeGen/X86/vec_saddo.ll +++ b/llvm/test/CodeGen/X86/vec_saddo.ll @@ -593,7 +593,8 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbd %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -606,8 +607,6 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: saddo_v16i8: @@ -769,9 +768,8 @@ define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSE2-NEXT: movdqa %xmm0, (%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: saddo_v2i64: @@ -792,9 +790,8 @@ define <2 x i32> @saddo_v2i64(<2 x 
i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSSE3-NEXT: pxor %xmm2, %xmm2 ; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 ; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSSE3-NEXT: movdqa %xmm0, (%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: saddo_v2i64: @@ -815,9 +812,8 @@ define <2 x i32> @saddo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 ; SSE41-NEXT: pxor %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: retq ; ; AVX-LABEL: saddo_v2i64: diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index d06993da6365d..1688bc31f7e53 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -598,7 +598,8 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovsxbd %xmm3, %xmm4 +; SSE41-NEXT: movdqa %xmm0, (%rdi) +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -611,8 +612,6 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: ssubo_v16i8: diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 6311678924d06..00898ab313a3c 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -195,11 +195,10 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi) ; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: vmovdqa %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v3i32: @@ -213,11 +212,10 @@ define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi) ; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: vmovdqa %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: umulo_v3i32: @@ -313,10 +311,9 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vmovdqa %xmm2, %xmm0 +; AVX1-NEXT: vpxor %xmm3, 
%xmm2, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v4i32: @@ -330,10 +327,9 @@ define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi) -; AVX2-NEXT: vmovdqa %xmm2, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: umulo_v4i32: @@ -529,12 +525,11 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7] ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 ; AVX1-NEXT: vmovq %xmm1, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm0 +; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v6i32: @@ -548,10 +543,10 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovq %xmm1, 16(%rdi) +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi) ; AVX2-NEXT: vmovdqa %ymm2, %ymm0 ; AVX2-NEXT: retq @@ -701,12 +696,11 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7] ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm0 +; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v8i32: @@ -720,10 +714,9 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdi) -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: umulo_v8i32: diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll index 63ca7c6a00573..bac925c040763 100644 --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -165,11 +165,10 @@ define <8 x double> @interleave2x4f64(<4 x double> %a, <4 x double> %b) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3] -; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3] ; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[3],ymm1[3] -; AVX2-NEXT: vmovapd %ymm2, %ymm0 
+; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
 ; AVX2-NEXT: retq
 %result = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32>
 ret <8 x double> %result
@@ -203,11 +202,10 @@ define <8 x i64> @interleave2x4i64(<4 x i64> %a, <4 x i64> %b) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
-; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX2-NEXT: vmovaps %ymm2, %ymm0
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
 ; AVX2-NEXT: retq
 %result = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32>
 ret <8 x i64> %result
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index 00e43df15deea..e71aa794640f7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -946,15 +946,15 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
 ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
 ; SSE-NEXT: movdqa (%rdi), %xmm10
 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 32(%rdi), %xmm13
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pslld $16, %xmm0
 ; SSE-NEXT: psrad $16, %xmm0
 ; SSE-NEXT: pslld $16, %xmm13
+; SSE-NEXT: movdqa 16(%rdi), %xmm7
 ; SSE-NEXT: psrad $16, %xmm13
 ; SSE-NEXT: packssdw %xmm0, %xmm13
 ; SSE-NEXT: movdqa %xmm7, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index f105e065866af..26f2f17ad2404 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -1232,12 +1232,10 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-LABEL: load_i16_stride3_vf32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: subq $40, %rsp
-; SSE-NEXT: movdqa 96(%rdi), %xmm5
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 176(%rdi), %xmm6
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 144(%rdi), %xmm13
-; SSE-NEXT: movdqa 160(%rdi), %xmm9
 ; SSE-NEXT: movdqa 80(%rdi), %xmm11
 ; SSE-NEXT: movdqa (%rdi), %xmm15
 ; SSE-NEXT: movdqa 16(%rdi), %xmm10
@@ -1246,20 +1244,21 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
 ; SSE-NEXT: movdqa 64(%rdi), %xmm12
 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: pand %xmm1, %xmm3
 ; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: pandn %xmm12, %xmm2
 ; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
 ; SSE-NEXT: por %xmm2, %xmm3
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
+; SSE-NEXT: movdqa 160(%rdi), %xmm9
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,7,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1]
 ; SSE-NEXT: movdqa %xmm11, %xmm8
 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1268,16 +1267,17 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa %xmm15, %xmm3
 ; SSE-NEXT: pand %xmm1, %xmm3
 ; SSE-NEXT: por %xmm2, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
 ; SSE-NEXT: movdqa %xmm10, %xmm11
 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
+; SSE-NEXT: movdqa 96(%rdi), %xmm5
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
 ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index df28ac14a30c0..0f9f83bafdf93 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1879,9 +1879,9 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT: vmovdqa (%rdi), %xmm10
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
-; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index b18f08b62f0d4..22262b414df1a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -2411,21 +2411,21 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa (%rdi), %xmm10
 ; SSE-NEXT: movdqa 16(%rdi), %xmm13
 ; SSE-NEXT: movdqa 32(%rdi), %xmm9
-; SSE-NEXT: movdqa 48(%rdi), %xmm5
 ; SSE-NEXT: movdqa 224(%rdi), %xmm7
 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 160(%rdi), %xmm11
 ; SSE-NEXT: movdqa 176(%rdi), %xmm12
 ; SSE-NEXT: movdqa 208(%rdi), %xmm3
-; SSE-NEXT: movdqa 192(%rdi), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 192(%rdi), %xmm2
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
 ; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: pandn %xmm2, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: movdqa 48(%rdi), %xmm5
+; SSE-NEXT: pandn %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm3, %xmm8
 ; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
@@ -3673,12 +3673,12 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm1
 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3
 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm14
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm7
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm6
 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4887,21 +4887,21 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8
 ; SSE-NEXT: movdqa 464(%rdi), %xmm5
 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 400(%rdi), %xmm8
+; SSE-NEXT: movdqa 144(%rdi), %xmm6
 ; SSE-NEXT: movdqa 416(%rdi), %xmm11
 ; SSE-NEXT: movdqa 448(%rdi), %xmm4
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 432(%rdi), %xmm7
 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 144(%rdi), %xmm6
 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 80(%rdi), %xmm15
 ; SSE-NEXT: movdqa 96(%rdi), %xmm10
 ; SSE-NEXT: movdqa 128(%rdi), %xmm14
+; SSE-NEXT: movdqa %xmm0, %xmm1
+; SSE-NEXT: movdqa 400(%rdi), %xmm8
 ; SSE-NEXT: movdqa 112(%rdi), %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm0, %xmm1
 ; SSE-NEXT: pandn %xmm2, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3]
 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -6560,9 +6560,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: subq $1048, %rsp # imm = 0x418
 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm10
-; AVX2-NEXT: vmovdqa 512(%rdi), %ymm4
-; AVX2-NEXT: vmovdqa 480(%rdi), %ymm14
 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 480(%rdi), %ymm14
 ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm8
 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3
@@ -6574,6 +6573,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm1
 ; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX2-NEXT: vmovdqa 512(%rdi), %ymm4
 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index 605deed6536bf..a53ca1ccaa668 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -1579,9 +1579,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-LABEL: load_i16_stride6_vf16:
 ; SSE: # %bb.0:
 ; SSE-NEXT: subq $136, %rsp
-; SSE-NEXT: movdqa 112(%rdi), %xmm9
-; SSE-NEXT: movdqa 128(%rdi), %xmm7
 ; SSE-NEXT: movdqa 64(%rdi), %xmm2
+; SSE-NEXT: movdqa 128(%rdi), %xmm7
 ; SSE-NEXT: movdqa 80(%rdi), %xmm11
 ; SSE-NEXT: movdqa (%rdi), %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1592,10 +1591,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
 ; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
+; SSE-NEXT: pandn %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT: movdqa 112(%rdi), %xmm9
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
 ; SSE-NEXT: pand %xmm10, %xmm0
 ; SSE-NEXT: por %xmm1, %xmm0
@@ -1625,6 +1625,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
 ; SSE-NEXT: movdqa %xmm9, %xmm11
 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
@@ -1657,11 +1658,12 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: # xmm9 = mem[0,1,2,3,5,7,6,7]
 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm13[2],xmm9[3],xmm13[3]
 ; SSE-NEXT: movdqa %xmm10, %xmm13
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3]
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
 ; SSE-NEXT: pandn %xmm15, %xmm13
 ; SSE-NEXT: pand %xmm10, %xmm9
 ; SSE-NEXT: por %xmm13, %xmm9
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3]
 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0]
 ; SSE-NEXT: movdqa %xmm11, %xmm4
@@ -1672,8 +1674,6 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa %xmm7, %xmm5
 ; SSE-NEXT: pandn %xmm7, %xmm10
 ; SSE-NEXT: por %xmm2, %xmm10
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[1,3]
 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,0]
 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm15, %xmm1
@@ -4161,13 +4161,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10
-; AVX2-NEXT: vmovdqa 192(%rdi), %ymm11
 ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2
 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm3
 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vmovdqa 192(%rdi), %ymm11
 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
@@ -11381,8 +11381,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7]
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm24
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm11
 ; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm19
 ; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm0
@@ -11551,8 +11551,8 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm14
 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm10
 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm10
 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm17
 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm29, %ymm10, %ymm14
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index af340d15fe8f6..ec9f87b201a95 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -2027,21 +2027,21 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa 128(%rdi), %xmm6
 ; SSE-NEXT: movaps 160(%rdi), %xmm5
 ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill
-; SSE-NEXT: movaps 144(%rdi), %xmm7
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 192(%rdi), %xmm13
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
 ; SSE-NEXT: movdqa 176(%rdi), %xmm15
 ; SSE-NEXT: movdqa 208(%rdi), %xmm14
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
-; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: pandn %xmm0, %xmm2
 ; SSE-NEXT: movdqa %xmm15, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: movaps 144(%rdi), %xmm7
 ; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: por %xmm2, %xmm0
 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
 ; SSE-NEXT: movdqa %xmm3, %xmm2
 ; SSE-NEXT: movdqa %xmm3, %xmm10
@@ -2067,7 +2067,6 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
 ; SSE-NEXT: pand %xmm1, %xmm9
 ; SSE-NEXT: por %xmm2, %xmm9
 ; SSE-NEXT: movdqa %xmm10, %xmm2
@@ -4151,20 +4150,20 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa 128(%rdi), %xmm8
 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 160(%rdi), %xmm7
-; SSE-NEXT: movaps 144(%rdi), %xmm10
-; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 192(%rdi), %xmm9
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 176(%rdi), %xmm12
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
 ; SSE-NEXT: movdqa %xmm1, %xmm11
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
 ; SSE-NEXT: movdqa %xmm2, %xmm1
 ; SSE-NEXT: pandn %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm12, %xmm0
 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
+; SSE-NEXT: movaps 144(%rdi), %xmm10
 ; SSE-NEXT: pand %xmm2, %xmm0
 ; SSE-NEXT: por %xmm1, %xmm0
 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0]
@@ -7703,6 +7702,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,1]
 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7,8,9,10],ymm15[11],ymm3[12,13,14,15]
+; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm11
 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7]
 ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm12
@@ -7714,7 +7714,6 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
 ; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm11
 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1}
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7]
 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
@@ -8631,12 +8630,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movaps 144(%rdi), %xmm13
 ; SSE-NEXT: movdqa 192(%rdi), %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 208(%rdi), %xmm11
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
-; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm4
 ; SSE-NEXT: movdqa %xmm3, %xmm1
 ; SSE-NEXT: pandn %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm4, %xmm0
@@ -13252,23 +13251,23 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-LABEL: load_i16_stride7_vf64:
 ; AVX2-FCP: # %bb.0:
 ; AVX2-FCP-NEXT: subq $1544, %rsp # imm = 0x608
-; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm6
-; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm7
-; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm14
+; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm12
 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm15
+; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm14
 ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm12
 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm13
-; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11
+; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm6
 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11
 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm7
 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7]
@@ -14102,13 +14101,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
-; AVX512-NEXT: vmovdqa %ymm5, %ymm6
 ; AVX512-NEXT: vmovdqa %ymm4, %ymm8
 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
 ; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0
 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 %ymm5, %ymm19
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9
 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11
 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm21
@@ -14127,7 +14127,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
 ; AVX512-NEXT: vmovdqa64 %ymm8, %ymm17
-; AVX512-NEXT: vmovdqa64 %ymm6, %ymm19
 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -14175,7 +14174,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vmovdqa64 640(%rdi), %ymm16
 ; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7]
 ; AVX512-NEXT: vmovdqa 688(%rdi), %xmm3
@@ -14481,7 +14479,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vpshufb %ymm9, %ymm12, %ymm12
 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6],xmm12[7]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm1, %ymm15
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
 ; AVX512-NEXT: vextracti32x4 $1, %ymm12, %xmm25
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3]
@@ -14489,6 +14486,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX512-NEXT: vmovdqa %ymm1, %ymm15
 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15019,6 +15017,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm29
 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7]
 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
@@ -15072,7 +15071,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6,7]
 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm29
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vpermd %zmm26, %zmm23, %zmm9
 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
@@ -17012,7 +17010,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm2
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm22
 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm9
 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3,4,5],xmm3[6],xmm9[7]
@@ -17025,6 +17022,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4
 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17085,7 +17083,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm16, %zmm2
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm7
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7]
 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index 1b637cd203c8f..ff1e9cf28f2ea 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -3458,20 +3458,20 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 192(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 240(%rdi), %xmm5
+; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 224(%rdi), %xmm15
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm7
 ; SSE-NEXT: movdqa 144(%rdi), %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 128(%rdi), %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm7
 ; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 160(%rdi), %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 192(%rdi), %xmm3
 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
 ; SSE-NEXT: movdqa %xmm0, %xmm7
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4473,8 +4473,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
 ; AVX2-NEXT: vmovdqa %xmm9, %xmm14
 ; AVX2-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1]
 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -4497,7 +4498,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
@@ -4934,8 +4934,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
 ; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm14
 ; AVX2-FP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1]
 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -4958,7 +4959,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
@@ -5395,8 +5395,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
 ; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm14
 ; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1]
 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -5419,7 +5420,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
@@ -6261,7 +6261,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
 ; AVX512-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2
 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
@@ -6289,6 +6288,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
@@ -7034,7 +7034,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2
 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
@@ -7062,6 +7061,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
@@ -12886,8 +12886,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
 ; AVX512-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
 ; AVX512-NEXT: vmovdqa 560(%rdi), %xmm1
@@ -12973,13 +12975,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT: vmovdqa %xmm8, %xmm5
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm4
@@ -13055,7 +13056,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -13389,10 +13389,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm6
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm5
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm5
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -13455,7 +13456,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
@@ -13852,7 +13852,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9
@@ -13861,8 +13860,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm11
 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
@@ -14726,8 +14726,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
 ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
 ; AVX512DQ-NEXT: vmovdqa 560(%rdi), %xmm1
@@ -14813,13 +14815,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT: vmovdqa %xmm8, %xmm5
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
 ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm4
@@ -14895,7 +14896,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -15229,10 +15229,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm6
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm5
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm5
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -15295,7 +15296,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
@@ -15692,7 +15692,6 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9
@@ -15701,8 +15700,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11
 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index afdeebc45ed0a..65e3ba8b8200b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -727,10 +727,9 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
 ; SSE-LABEL: load_i32_stride3_vf16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps 96(%rdi), %xmm6
+; SSE-NEXT: movaps 144(%rdi), %xmm11
 ; SSE-NEXT: movaps 128(%rdi), %xmm1
 ; SSE-NEXT: movaps 112(%rdi), %xmm13
-; SSE-NEXT: movaps 144(%rdi), %xmm11
 ; SSE-NEXT: movaps 176(%rdi), %xmm10
 ; SSE-NEXT: movaps 160(%rdi), %xmm9
 ; SSE-NEXT: movaps (%rdi), %xmm7
@@ -739,16 +738,17 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 48(%rdi), %xmm15
 ; SSE-NEXT: movaps 80(%rdi), %xmm14
-; SSE-NEXT: movaps 64(%rdi), %xmm2
 ; SSE-NEXT: movaps %xmm2, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0]
 ; SSE-NEXT: movaps %xmm15, %xmm5
+; SSE-NEXT: movaps 64(%rdi), %xmm2
 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm8, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
 ; SSE-NEXT: movaps %xmm7, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2]
+; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm8[0,2]
+; SSE-NEXT: movaps 96(%rdi), %xmm6
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0]
 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm9, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0]
@@ -1198,31 +1198,31 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-LABEL: load_i32_stride3_vf32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: subq $392, %rsp # imm = 0x188
-; SSE-NEXT: movaps 192(%rdi), %xmm4
+; SSE-NEXT: movaps 240(%rdi), %xmm7
 ; SSE-NEXT: movaps 224(%rdi), %xmm3
-; SSE-NEXT: movaps 208(%rdi), %xmm14
 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 240(%rdi), %xmm7
+; SSE-NEXT: movaps 208(%rdi), %xmm14
 ; SSE-NEXT: movaps 272(%rdi), %xmm6
 ; SSE-NEXT: movaps 256(%rdi), %xmm9
-; SSE-NEXT: movaps (%rdi), %xmm13
+; SSE-NEXT: movaps 48(%rdi), %xmm2
+; SSE-NEXT: movaps 32(%rdi), %xmm11
 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 16(%rdi), %xmm8
 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 32(%rdi), %xmm11
 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 48(%rdi), %xmm2
-; SSE-NEXT: movaps 80(%rdi), %xmm1
+; SSE-NEXT: movaps 192(%rdi), %xmm4
 ; SSE-NEXT: movaps 64(%rdi), %xmm5
 ; SSE-NEXT: movaps %xmm5, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
+; SSE-NEXT: movaps 80(%rdi), %xmm1
+; SSE-NEXT: movaps (%rdi), %xmm13
 ; SSE-NEXT: movaps %xmm1, %xmm12
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm2, %xmm1
 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm9, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,2]
+; SSE-NEXT: movaps 16(%rdi), %xmm8
+; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0]
 ; SSE-NEXT: movaps %xmm6, %xmm10
 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1259,6 +1259,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movaps 352(%rdi), %xmm0
 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
+; SSE-NEXT: movaps %xmm7, %xmm14
 ; SSE-NEXT: movaps 336(%rdi), %xmm4
 ; SSE-NEXT: movaps %xmm4, %xmm1
 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1288,7 +1289,6 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm12[2,3]
 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm7, %xmm14
 ; SSE-NEXT: movaps %xmm9, %xmm0
 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm9[0,0]
@@ -1743,7 +1743,6 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-LABEL: load_i32_stride3_vf32:
 ; AVX2-FCP: # %bb.0:
 ; AVX2-FCP-NEXT: subq $72, %rsp
-; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0
 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2
 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1
@@ -1764,6 +1763,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
 ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm3
+; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0
 ; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7]
 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7]
@@ -2145,19 +2145,19 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 448(%rdi), %xmm11
 ; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill
-; SSE-NEXT: movaps 240(%rdi), %xmm7
+; SSE-NEXT: movaps 48(%rdi), %xmm9
 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 272(%rdi), %xmm3
 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 256(%rdi), %xmm13
 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 48(%rdi), %xmm9
 ; SSE-NEXT: movaps 80(%rdi), %xmm1
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 64(%rdi), %xmm12
 ; SSE-NEXT: movaps %xmm12, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0]
 ; SSE-NEXT: movaps %xmm9, %xmm1
+; SSE-NEXT: movaps 240(%rdi), %xmm7
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0]
+; SSE-NEXT: movaps 256(%rdi), %xmm13
 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 3874581e621b3..aa23dcc824c72 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -1030,7 +1030,6 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX: # %bb.0:
 ; AVX-NEXT: subq $264, %rsp # imm = 0x108
 ; AVX-NEXT: vmovaps 64(%rdi), %ymm5
-; AVX-NEXT: vmovaps 96(%rdi), %ymm4
 ; AVX-NEXT: vmovaps 192(%rdi), %ymm2
 ; AVX-NEXT: vmovaps 224(%rdi), %ymm3
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1]
@@ -1041,13 +1040,14 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
 ; AVX-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
 ; AVX-NEXT: vmovaps %ymm2, %ymm10
-; AVX-NEXT: vmovaps %ymm1, %ymm3
 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4]
+; AVX-NEXT: vmovaps %ymm1, %ymm3
 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1
 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT: vmovaps 176(%rdi), %xmm6
 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm1[0]
 ; AVX-NEXT: vmovaps %xmm6, %xmm2
+; AVX-NEXT: vmovaps 96(%rdi), %ymm4
 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX-NEXT: vmovaps 144(%rdi), %xmm1
 ; AVX-NEXT: vmovaps 128(%rdi), %xmm6
@@ -1776,9 +1776,8 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 304(%rdi), %xmm8
 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 288(%rdi), %xmm2
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 336(%rdi), %xmm10
+; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 320(%rdi), %xmm6
 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1799,8 +1798,9 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
 ; SSE-NEXT: movaps %xmm4, %xmm1
 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm5, %xmm1
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 288(%rdi), %xmm2
 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
 ; SSE-NEXT: movaps %xmm6, %xmm5
 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
@@ -3515,17 +3515,14 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: movaps 144(%rdi), %xmm14
 ; SSE-NEXT: movaps 176(%rdi), %xmm11
 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 160(%rdi), %xmm5
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 208(%rdi), %xmm3
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 192(%rdi), %xmm8
-; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 80(%rdi), %xmm10
 ; SSE-NEXT: movaps 240(%rdi), %xmm6
+; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 224(%rdi), %xmm7
 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 80(%rdi), %xmm10
 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 64(%rdi), %xmm4
 ; SSE-NEXT: movaps 112(%rdi), %xmm2
@@ -3535,10 +3532,13 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT: movaps %xmm4, %xmm2
 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
+; SSE-NEXT: movaps 224(%rdi), %xmm7
 ; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: movaps 192(%rdi), %xmm8
{{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm1 +; SSE-NEXT: movaps 160(%rdi), %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; SSE-NEXT: movaps %xmm8, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] @@ -4673,19 +4673,19 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 704(%rdi), %ymm7 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 448(%rdi), %ymm4 -; AVX2-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm14 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] ; AVX2-NEXT: vpermps %ymm14, %ymm2, %ymm0 ; AVX2-NEXT: vpermps %ymm10, %ymm2, %ymm1 -; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -5166,19 +5166,19 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm14 ; AVX2-FP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] ; AVX2-FP-NEXT: vpermps %ymm14, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] @@ -5659,19 +5659,19 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm14 ; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm2 = [0,4,0,4,0,4,0,4] ; AVX2-FCP-NEXT: vpermps %ymm14, 
%ymm2, %ymm0 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index dd94dffa85932..1238b1c097628 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -686,34 +686,34 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { ; SSE-LABEL: load_i32_stride5_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 144(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa 112(%rdi), %xmm5 -; SSE-NEXT: movdqa 80(%rdi), %xmm11 +; SSE-NEXT: movapd 80(%rdi), %xmm11 ; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa (%rdi), %xmm14 ; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: movdqa 48(%rdi), %xmm2 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa %xmm14, %xmm8 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,2,2,2] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] +; SSE-NEXT: movdqa 64(%rdi), %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, %xmm10 -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE-NEXT: movapd %xmm11, %xmm10 +; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] +; SSE-NEXT: movdqa 144(%rdi), %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm12[0],xmm10[1] @@ -1293,24 +1293,23 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 272(%rdi), %xmm3 ; SSE-NEXT: movdqa 240(%rdi), %xmm14 ; SSE-NEXT: movdqa 256(%rdi), %xmm8 -; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 208(%rdi), %xmm6 ; SSE-NEXT: movdqa 16(%rdi), %xmm15 ; SSE-NEXT: movdqa 32(%rdi), %xmm5 ; SSE-NEXT: movdqa 48(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: movdqa 160(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 160(%rdi), %xmm10 ; SSE-NEXT: movdqa 176(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] -; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa (%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm9 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa %xmm11, %xmm1 @@ -1344,7 +1343,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1664,12 +1663,11 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-LABEL: load_i32_stride5_vf16: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3 @@ -1685,6 +1683,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-NEXT: vpermd %ymm12, %ymm10, %ymm10 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -1782,12 +1781,11 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-LABEL: load_i32_stride5_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $72, %rsp -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm3 @@ -1803,6 +1801,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-FP-NEXT: vpermd %ymm12, %ymm10, %ymm10 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -1900,12 +1899,11 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-LABEL: load_i32_stride5_vf16: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $72, %rsp -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 @@ -1921,6 +1919,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] +; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -2520,32 +2519,30 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride5_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa (%rdi), %xmm11 ; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 448(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm4 -; SSE-NEXT: movdqa 400(%rdi), %xmm10 ; SSE-NEXT: movdqa 416(%rdi), %xmm14 -; SSE-NEXT: movdqa 128(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movapd 128(%rdi), %xmm6 +; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm7 -; SSE-NEXT: movdqa 80(%rdi), %xmm12 -; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm15 +; SSE-NEXT: movapd 80(%rdi), %xmm12 +; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] +; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa (%rdi), %xmm11 ; 
SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm1 @@ -2564,9 +2561,9 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3355,17 +3352,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm14 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm14 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -3634,17 +3631,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm14 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -3913,17 +3910,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), 
%ymm13 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -5007,11 +5004,10 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 400(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 416(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5020,6 +5016,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE-NEXT: movdqa 400(%rdi), %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] @@ -6745,27 +6742,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 896(%rdi), %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 832(%rdi), %ymm15 -; AVX2-NEXT: vmovdqa 800(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm11 ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] +; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-NEXT: vmovdqa 800(%rdi), %ymm7 ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] @@ -7302,27 +7299,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm7 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] @@ -7859,27 +7856,27 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm7 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 8820dccc40bf4..864e41510030b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -1663,30 +1663,29 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $408, %rsp # imm = 0x198 ; SSE-NEXT: movdqa 240(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 256(%rdi), %xmm3 -; SSE-NEXT: movdqa 192(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm4 ; SSE-NEXT: movdqa 336(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 352(%rdi), %xmm5 -; SSE-NEXT: movdqa 288(%rdi), %xmm15 -; SSE-NEXT: movdqa 304(%rdi), %xmm7 ; SSE-NEXT: movdqa 64(%rdi), %xmm12 -; SSE-NEXT: movdqa (%rdi), %xmm8 +; SSE-NEXT: movdqa 288(%rdi), %xmm15 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movapd (%rdi), %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movdqa 208(%rdi), %xmm4 ; SSE-NEXT: movdqa 48(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa 256(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm8[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2052,6 +2051,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[3,1],ymm8[4,5],ymm4[7,5] ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm5[3,3] ; AVX-NEXT: vmovaps %ymm7, %ymm1 +; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7] ; AVX-NEXT: vmovups (%rsp), %ymm7 # 32-byte Reload ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[3,1],ymm1[2,1],ymm7[7,5],ymm1[6,5] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = 
ymm5[2,3,0,1] @@ -2067,7 +2067,6 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3],ymm9[4,5,6,7] ; AVX-NEXT: vmovaps 32(%rdi), %xmm3 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,3,3] ; AVX-NEXT: vmovaps 16(%rdi), %xmm5 @@ -2145,13 +2144,13 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rdi), %ymm13 ; AVX2-NEXT: vmovaps {{.*#+}} xmm6 = [0,6,4,u] @@ -2322,13 +2321,13 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm5 +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm6 = [0,6,4,u] @@ -2502,12 +2501,12 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [0,6,4,u] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm7 @@ -3395,19 +3394,17 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 544(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm8 ; SSE-NEXT: movdqa 496(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movdqa 96(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 480(%rdi), %xmm8 +; SSE-NEXT: movdqa 112(%rdi), %xmm11 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; SSE-NEXT: movdqa %xmm2, %xmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index ed316990e4866..b268c4a984cc1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -2018,21 +2018,21 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride7_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: movdqa 240(%rdi), %xmm6 ; SSE-NEXT: movdqa 272(%rdi), %xmm5 ; SSE-NEXT: movdqa 224(%rdi), %xmm15 -; SSE-NEXT: movdqa 240(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm7 ; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa 48(%rdi), %xmm9 ; SSE-NEXT: movdqa 192(%rdi), %xmm14 -; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 304(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -4210,30 +4210,30 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: subq $1160, %rsp # imm = 0x488 ; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 640(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 640(%rdi), %xmm3 ; SSE-NEXT: movdqa 608(%rdi), %xmm4 -; SSE-NEXT: movdqa 560(%rdi), %xmm10 ; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: movdqa 560(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 ; SSE-NEXT: movdqa 160(%rdi), %xmm9 ; SSE-NEXT: movdqa 112(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
SSE-NEXT: movdqa 128(%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movaps 128(%rdi), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm12 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm9, %xmm12 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] @@ -4490,8 +4490,8 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: movdqa 736(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[2,3,2,3] +; SSE-NEXT: movdqa 736(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] ; SSE-NEXT: movdqa %xmm2, %xmm12 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8551,18 +8551,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 640(%rdi), %xmm13 ; SSE-NEXT: movdqa 608(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 560(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 576(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm15 ; SSE-NEXT: movdqa 112(%rdi), %xmm1 +; SSE-NEXT: movdqa 160(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; SSE-NEXT: movdqa 560(%rdi), %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -11204,13 +11204,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 1120(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 768(%rdi), %ymm12 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] +; AVX2-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12234,13 +12234,13 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -13262,22 +13262,22 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm13 +; AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 2fd173c729170..9448acd134008 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -2426,14 +2426,14 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovaps 448(%rdi), %ymm7 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; AVX-NEXT: vmovaps 480(%rdi), %ymm9 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; 
AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX-NEXT: vmovaps 480(%rdi), %ymm9 +; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] @@ -4401,9 +4401,8 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 608(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 672(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 640(%rdi), %xmm4 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4413,19 +4412,20 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 576(%rdi), %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] @@ -9221,9 +9221,8 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 352(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 416(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 384(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9233,19 +9232,20 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdi), %xmm10 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movaps %xmm8, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] @@ -15784,7 +15784,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm31 ; AVX512-NEXT: vmovaps 1536(%rdi), %zmm0 ; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm24 @@ -15802,14 +15801,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm13 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm4 @@ -15818,11 +15816,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm31 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 @@ -15869,10 +15869,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 
%zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm22 @@ -16381,7 +16381,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 ; AVX512-FCP-NEXT: vmovaps 1536(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24 @@ -16399,14 +16398,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 @@ -16415,11 +16413,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 @@ -16466,10 +16466,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22 @@ -16978,7 +16978,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm31 ; AVX512DQ-NEXT: vmovaps 1536(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 
64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm24
@@ -16996,14 +16995,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm13
 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm28
 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm4
@@ -17012,11 +17010,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16
 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4
 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4
+; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-NEXT: movb $-64, %al
 ; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
@@ -17063,10 +17063,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24
 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5
-; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5
+; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm22
@@ -17575,7 +17575,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-FCP-NEXT: vmovaps 1536(%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24
@@ -17593,14 +17592,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
@@ -17609,11 +17607,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-FCP-NEXT: movb $-64, %al
 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
@@ -17660,10 +17660,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
@@ -18172,7 +18172,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm11
 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18
-; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512BW-NEXT: vmovaps 1536(%rdi), %zmm0
 ; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm24
@@ -18190,14 +18189,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm13
 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27
 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10
+; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm17
-; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12
-; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10
 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28
 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm23
 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4
@@ -18206,11 +18204,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16
 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4
 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4
+; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-NEXT: movb $-64, %al
 ; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1
 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15
 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
@@ -18257,10 +18257,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24
 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5
+; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm22
@@ -18769,7 +18769,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18
-; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512BW-FCP-NEXT: vmovaps 1536(%rdi), %zmm0
 ; AVX512BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24
@@ -18787,14 +18786,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27
 ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
+; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17
-; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12
-; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23
 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
@@ -18803,11 +18801,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-FCP-NEXT: movb $-64, %al
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
@@ -18854,10 +18854,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
@@ -19366,7 +19366,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm11
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm18
-; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-BW-NEXT: vmovaps 1536(%rdi), %zmm0
 ; AVX512DQ-BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm24
@@ -19384,14 +19383,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm13
 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28
 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4
@@ -19400,11 +19398,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16
 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4
 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4
+; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-NEXT: movb $-64, %al
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
@@ -19451,10 +19451,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24
 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm22
@@ -19963,7 +19963,6 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-BW-FCP-NEXT: vmovaps 1536(%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24
@@ -19981,14 +19980,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
@@ -19997,11 +19995,13 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
@@ -20048,10 +20048,10 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
index 6716d97b3f07c..9ed29fc54dbc1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
@@ -1409,7 +1409,6 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm0
 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 192(%rdi), %ymm5
 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm2
 ; AVX2-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9
 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm2
@@ -1419,9 +1418,10 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT: vmovaps 416(%rdi), %xmm7
 ; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
 ; AVX2-NEXT: vmovaps 384(%rdi), %xmm7
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
+; AVX2-NEXT: vmovaps 192(%rdi), %ymm5
 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vmovaps (%rdi), %xmm8
 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm13
 ; AVX2-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
@@ -1527,7 +1527,6 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0
 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm5
 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm2
 ; AVX2-FP-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9
 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm2
@@ -1537,9 +1536,10 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm7
 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm7
+; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
+; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm5
 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8
 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm13
 ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
@@ -1645,7 +1645,6 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0
 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm5
 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm2
 ; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rdi), %ymm2, %ymm9
 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm2
@@ -1655,9 +1654,10 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm7
 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11
 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm7
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12
+; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm5
 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2]
-; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8
 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm13
 ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index 70164cff89072..1ef07aabc54c9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -798,18 +798,18 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-LABEL: load_i64_stride5_vf8:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovapd 128(%rdi), %ymm1
-; AVX-NEXT: vmovapd 256(%rdi), %ymm0
+; AVX-NEXT: vmovaps 96(%rdi), %ymm2
 ; AVX-NEXT: vmovapd 224(%rdi), %ymm9
-; AVX-NEXT: vmovapd 96(%rdi), %ymm2
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm2[6,7]
 ; AVX-NEXT: vmovapd 64(%rdi), %ymm7
-; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1,2],ymm2[3]
-; AVX-NEXT: vmovapd (%rdi), %xmm10
+; AVX-NEXT: vmovaps (%rdi), %xmm10
+; AVX-NEXT: vmovaps 32(%rdi), %xmm4
 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm8
-; AVX-NEXT: vmovapd 32(%rdi), %xmm4
 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm11
-; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm10[0],xmm4[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 256(%rdi), %ymm0
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3]
 ; AVX-NEXT: vmovapd %ymm0, %ymm3
 ; AVX-NEXT: vmovapd 192(%rdi), %xmm5
@@ -849,9 +849,9 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm0
 ; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3]
-; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm4[0],xmm15[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm15[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm12[3]
 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
@@ -865,7 +865,7 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vmovapd %ymm11, 32(%r8)
 ; AVX-NEXT: vmovapd %ymm8, (%r8)
 ; AVX-NEXT: vmovapd %ymm0, 32(%r9)
-; AVX-NEXT: vmovapd %ymm1, (%r9)
+; AVX-NEXT: vmovaps %ymm1, (%r9)
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: retq
 ;
@@ -10952,7 +10952,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm7
 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm0
 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm8
 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm4
@@ -10966,6 +10965,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11
 ; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11
 ; AVX512-NEXT: vpermt2q %zmm5, %zmm16, %zmm11
@@ -11420,7 +11420,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7
 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0
 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8
 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
@@ -11434,6 +11433,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11
 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11
@@ -11888,7 +11888,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm7
 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm0
 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm8
 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm4
@@ -11902,6 +11901,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm16, %zmm11
@@ -12356,7 +12356,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8
 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
@@ -12370,6 +12369,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11
@@ -12824,7 +12824,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm7
 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm0
 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm8
 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4
@@ -12838,6 +12837,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11
 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm11
@@ -13292,7 +13292,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7
 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0
 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8
 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
@@ -13306,6 +13305,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11
@@ -13760,7 +13760,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm7
 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm0
 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm8
 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4
@@ -13774,6 +13773,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm11
@@ -14228,7 +14228,6 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm7
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm4
@@ -14242,6 +14241,7 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index 21e1b17760c24..3dbd078504caa 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -919,13 +919,13 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movaps 16(%rdi), %xmm7
 ; SSE-NEXT: movaps 48(%rdi), %xmm1
 ; SSE-NEXT: movaps 144(%rdi), %xmm2
-; SSE-NEXT: movaps 96(%rdi), %xmm11
 ; SSE-NEXT: movaps 240(%rdi), %xmm3
 ; SSE-NEXT: movaps 192(%rdi), %xmm12
 ; SSE-NEXT: movaps 336(%rdi), %xmm4
-; SSE-NEXT: movaps 288(%rdi), %xmm9
 ; SSE-NEXT: movaps %xmm9, %xmm14
+; SSE-NEXT: movaps 288(%rdi), %xmm9
 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0]
+; SSE-NEXT: movaps 96(%rdi), %xmm11
 ; SSE-NEXT: movaps %xmm14, (%rsp) # 16-byte Spill
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1]
 ; SSE-NEXT: movaps %xmm12, %xmm4
@@ -2502,10 +2502,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-LABEL: load_i64_stride6_vf16:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8
-; AVX2-NEXT: vmovaps 320(%rdi), %ymm10
+; AVX2-NEXT: vmovaps 512(%rdi), %ymm7
 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm12
 ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovaps 512(%rdi), %ymm7
 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm4
 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm0
@@ -2514,6 +2513,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-NEXT: vmovaps (%rdi), %xmm1
 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm5
 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
+; AVX2-NEXT: vmovaps 320(%rdi), %ymm10
 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-NEXT: vmovaps %ymm0, %ymm15
 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2716,10 +2716,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-LABEL: load_i64_stride6_vf16:
 ; AVX2-FP: # %bb.0:
 ; AVX2-FP-NEXT: subq $488, %rsp # imm = 0x1E8
-; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm10
+; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm7
 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm12
 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm7
 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm4
 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0
@@ -2728,6 +2727,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1
 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm5
 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
+; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm10
 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-FP-NEXT: vmovaps %ymm0, %ymm15
 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2930,10 +2930,9 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-LABEL: load_i64_stride6_vf16:
 ; AVX2-FCP: # %bb.0:
 ; AVX2-FCP-NEXT: subq $488, %rsp # imm = 0x1E8
-; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm10
+; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm7
 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm12
 ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm7
 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm4
 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0
@@ -2942,6 +2941,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1
 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm5
 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0]
+; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm10
 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
 ; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm15
 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index 1d1da0954d675..16647d0da63c5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -9706,11 +9706,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-LABEL: load_i64_stride7_vf32:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: subq $2760, %rsp # imm = 0xAC8
-; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm20
-; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4
+; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1
+; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm20
 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30
 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm28
 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12
@@ -9723,7 +9726,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6
@@ -9735,20 +9737,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3
 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3
 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3
@@ -10180,11 +10179,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf32:
 ; AVX512BW-FCP: # %bb.0:
 ; AVX512BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8
-; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20
-; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20
 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30
 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12
@@ -10197,7 +10199,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
@@ -10209,20 +10210,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
@@ -10654,11 +10652,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-LABEL: load_i64_stride7_vf32:
 ; AVX512DQ-BW: # %bb.0:
 ; AVX512DQ-BW-NEXT: subq $2760, %rsp # imm = 0xAC8
-; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm20
-; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm1
+; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm20
 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm30
 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm28
 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm12
@@ -10671,7 +10672,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6
@@ -10683,20 +10683,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3
@@ -11128,11 +11125,14 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf32:
 ; AVX512DQ-BW-FCP: # %bb.0:
 ; AVX512DQ-BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12
@@ -11145,7 +11145,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
 ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
@@ -11157,20 +11156,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0]
 ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
 ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
 ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
@@ -17266,25 +17262,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm19
 ; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm2
 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm18
 ; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm7
-; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm17
 ; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm9
 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm11
 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm12
 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm13
 ; AVX512-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm14
@@ -17297,10 +17293,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm15
 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512-NEXT: vmovdqa 2704(%rdi), %xmm2
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -17308,6 +17303,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512-NEXT: vmovdqa 464(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -18265,25 +18261,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512-FCP-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7
-; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20
 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
@@ -18296,10 +18292,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -18307,6 +18302,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -19264,25 +19260,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm19
 ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQ-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm18
 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm7
-; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm17
 ; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm9
 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm11
 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm12
 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm13
 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm14
@@ -19295,10 +19291,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm15
 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-NEXT: vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -19306,6 +19301,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm2
+; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5
 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -20263,25 +20259,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
@@ -20294,10 +20290,9 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
@@ -20305,6 +20300,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
 ; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -21261,15 +21257,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm6
 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm25
-; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2
 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm30
+; AVX512BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm19
+; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm8
 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -21283,19 +21280,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11
 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16
 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14
-; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17
+; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15
+; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14
 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT: vmovdqa 2704(%rdi), %xmm2
 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
@@ -22243,15 +22239,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25
-; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30
+; AVX512BW-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19
+; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -22265,19 +22262,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15
+; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
@@ -23225,15 +23221,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm25
-; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm30
+; AVX512DQ-BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm5
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm19
+; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm8
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -23247,19 +23244,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm11
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm17
+; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm15
+; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
@@ -24207,15 +24203,16 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -24229,19 +24226,18 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index ceb4948726760..80f628099ee89 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -4046,13 +4046,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vmovaps 512(%rdi), %zmm0
 ; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm6
-; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4
-; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm7
 ; AVX512-NEXT: vmovaps (%rdi), %zmm0
+; AVX512-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm8
 ; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm31
-; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm8
 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm10
 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm14
 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm30
@@ -4077,15 +4075,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm17
 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm21
-; AVX512-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1}
 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm22
 ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23
+; AVX512-NEXT: vpermt2q %zmm16, %zmm19, %zmm21
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2]
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm7
 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm24
 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm25
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2]
 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0
 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm11
@@ -4099,19 +4098,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm27
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0
= ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -4269,13 +4269,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovaps 512(%rdi), %zmm0 ; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -4300,15 +4298,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -4322,19 +4321,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm0[2,3],ymm13[2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -4492,13 +4492,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovaps 512(%rdi), %zmm0 ; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovaps (%rdi), %zmm0 +; AVX512DQ-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -4523,15 +4521,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -4545,19 +4544,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; 
AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -4715,13 +4715,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovaps 512(%rdi), %zmm0 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -4746,15 +4744,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -4768,19 +4767,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, 
%ymm0, %zmm19, %zmm19 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -4938,13 +4938,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovaps 512(%rdi), %zmm0 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512BW-NEXT: vmovaps (%rdi), %zmm0 +; AVX512BW-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -4969,15 +4967,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -4991,19 +4990,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512BW-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -5161,13 +5161,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovaps 512(%rdi), %zmm0 ; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -5192,15 +5190,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -5214,19 +5213,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, 
%zmm19 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -5384,13 +5384,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovaps 512(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm0 +; AVX512DQ-BW-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -5415,15 +5413,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -5437,19 +5436,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = 
ymm0[2,3],ymm13[2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -5607,13 +5607,11 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovaps 512(%rdi), %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -5638,15 +5636,16 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -5660,19 +5659,20 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -8744,24 +8744,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: movb $-64, %al +; AVX512-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512-NEXT: movb $-64, %al -; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -8804,6 +8804,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -8824,7 +8825,6 @@ define void 
@load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8852,9 +8852,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm15 @@ -8867,7 +8868,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -8986,7 +8986,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -8998,6 +8997,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -9265,24 +9265,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: movb $-64, %al +; AVX512-FCP-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: 
vmovdqa64 448(%rdi), %zmm25 +; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512-FCP-NEXT: movb $-64, %al -; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -9325,6 +9325,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -9345,7 +9346,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9373,9 +9373,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 @@ -9388,7 +9389,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -9507,7 +9507,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512-FCP-NEXT: vpermt2q 
%zmm25, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -9519,6 +9518,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -9786,24 +9786,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: movb $-64, %al +; AVX512DQ-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512DQ-NEXT: movb $-64, %al -; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -9846,6 +9846,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -9866,7 +9867,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9894,9 +9894,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = 
ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm15 @@ -9909,7 +9910,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -10028,7 +10028,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -10040,6 +10039,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -10307,24 +10307,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: movb $-64, %al +; AVX512DQ-FCP-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512DQ-FCP-NEXT: movb $-64, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), 
%zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -10367,6 +10367,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -10387,7 +10388,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10415,9 +10415,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 @@ -10430,7 +10431,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -10549,7 +10549,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -10561,6 +10560,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; 
AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -10828,24 +10828,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -10888,6 +10888,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -10908,7 +10909,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10936,9 +10936,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 @@ -10951,7 +10952,6 @@ define void 
@load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -11070,7 +11070,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -11082,6 +11081,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -11349,24 +11349,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: movb $-64, %al +; AVX512BW-FCP-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512BW-FCP-NEXT: movb $-64, %al -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -11409,6 +11409,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; 
AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -11429,7 +11430,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11457,9 +11457,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 @@ -11472,7 +11473,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -11591,7 +11591,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -11603,6 +11602,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -11870,24 +11870,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-BW-NEXT: movb $-64, %al +; AVX512DQ-BW-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512DQ-BW-NEXT: movb $-64, %al -; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -11930,6 +11930,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -11950,7 +11951,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11978,9 +11978,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm15 @@ -11993,7 +11994,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = 
ymm1[1],ymm30[1],ymm1[3],ymm30[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -12112,7 +12112,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -12124,6 +12123,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -12391,24 +12391,24 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: movb $-64, %al +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 -; AVX512DQ-BW-FCP-NEXT: movb $-64, %al -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -12451,6 +12451,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} @@ -12471,7 +12472,6 @@ 
define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm10[1],ymm22[1],ymm10[3],ymm22[3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm6[1],ymm9[1],ymm6[3],ymm9[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],ymm5[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm18, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12499,9 +12499,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm14[1],ymm4[1],ymm14[3],ymm4[3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 @@ -12514,7 +12515,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm26, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm1[1],ymm30[1],ymm1[3],ymm30[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 @@ -12633,7 +12633,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -12645,6 +12644,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -18960,32 +18960,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: movb $-64, %al +; AVX512-NEXT: vmovups %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -19025,10 +19025,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa (%rdi), %ymm13 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm1 @@ -20031,32 +20031,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: movb $-64, %al +; AVX512-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -20096,10 +20096,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm13 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 @@ -21102,32 +21102,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: movb $-64, %al +; AVX512DQ-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-NEXT: 
vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: movb $-64, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -21167,10 +21167,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm13 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm1 @@ -22173,32 +22173,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: movb $-64, %al +; AVX512DQ-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -22238,10 +22238,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm13 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 @@ -23244,32 +23244,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: movb $-64, %al +; AVX512BW-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -23309,10 +23309,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm13 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 @@ -24315,32 +24315,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: movb $-64, %al +; AVX512BW-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: movb $-64, %al ; 
AVX512BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -24380,10 +24380,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm13 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 @@ -25386,32 +25386,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: movb $-64, %al +; AVX512DQ-BW-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: movb $-64, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 
{{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -25451,10 +25451,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm13 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm1 @@ -26457,32 +26457,32 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: movb $-64, %al +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 
{{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -26522,10 +26522,10 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm13 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm13[0],ymm21[0],ymm13[2],ymm21[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll index faecad65c395b..429758c835065 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -1037,9 +1037,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: pandn %xmm4, %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] @@ -1053,8 +1052,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm7, %xmm0 ; SSE-NEXT: pxor %xmm9, %xmm9 ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] ; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] @@ -1075,6 +1074,7 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: movdqa 48(%rdi), %xmm12 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] ; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 15f6ef4006fdd..60fcf25b507b7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -1125,9 +1125,8 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm15 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pand %xmm8, %xmm0 @@ -1135,6 +1134,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: packuswb %xmm0, %xmm7 ; SSE-NEXT: packuswb %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm11, %xmm1 @@ -1959,9 +1959,8 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride4_vf64: ; SSE: # %bb.0: ; SSE-NEXT: subq $600, %rsp # imm = 0x258 -; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm15 +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm14 ; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa 144(%rdi), %xmm7 @@ -1978,8 +1977,8 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: packuswb %xmm13, %xmm1 ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 @@ -1989,9 +1988,10 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa 16(%rdi), %xmm8 ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll 
index 43a45b9fd59a7..c215e2dd9f4d9 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -2125,10 +2125,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 @@ -4083,12 +4083,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 160(%rdi), %xmm9 ; SSE-NEXT: movdqa 176(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 @@ -4790,10 +4790,10 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm8, %xmm15 ; SSE-NEXT: pand %xmm8, %xmm13 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index e4dc257543d20..c7b73198c7f4d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -1437,6 +1437,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: por %xmm11, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,1,3] +; SSE-NEXT: pandn %xmm3, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm11 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,6,7] @@ -1446,7 +1447,6 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm3, %xmm12 ; SSE-NEXT: por %xmm12, %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[3,1,2,0] ; SSE-NEXT: pand %xmm1, %xmm9 @@ -2586,8 +2586,9 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; 
SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm1 @@ -2673,7 +2674,6 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 160(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm14, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm10, %xmm12 @@ -4661,11 +4661,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] @@ -4674,6 +4673,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 @@ -4881,6 +4881,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa 112(%rdi), %xmm6 +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 @@ -4891,7 +4892,6 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pandn %xmm13, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: movdqa %xmm2, %xmm1 @@ -4926,6 +4926,8 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pand %xmm0, %xmm2 @@ -4945,8 +4947,6 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 @@ -6046,8 +6046,8 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm1 ; AVX-NEXT: vmovdqa %xmm2, %xmm8 -; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vmovdqa %xmm14, %xmm13 +; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm2 ; AVX-NEXT: vmovdqa %xmm3, %xmm14 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1 @@ -6435,9 +6435,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm7 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1] ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7] +; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX-NEXT: vandnps %ymm5, %ymm6, %ymm5 ; AVX-NEXT: vmovaps %ymm6, %ymm0 ; AVX-NEXT: vorps %ymm5, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 130ae31b37bfe..48ef75742eccb 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -1087,7 +1087,6 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm3 @@ -1097,6 +1096,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5] @@ -1863,7 +1863,6 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride7_vf16: ; SSE: # %bb.0: ; SSE-NEXT: subq $168, %rsp -; SSE-NEXT: movdqa 96(%rdi), %xmm15 ; SSE-NEXT: movdqa 80(%rdi), %xmm4 ; SSE-NEXT: movdqa 64(%rdi), %xmm7 ; SSE-NEXT: movdqa (%rdi), %xmm6 @@ -1898,6 +1897,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa 96(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535] @@ -3619,17 +3619,16 @@ define void @load_i8_stride7_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: subq $648, %rsp # imm = 0x288 ; SSE-NEXT: movdqa 208(%rdi), %xmm14 -; SSE-NEXT: movdqa 192(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 ; SSE-NEXT: movdqa 112(%rdi), %xmm4 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm6 ; SSE-NEXT: movdqa 128(%rdi), %xmm3 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm7 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm1 @@ -3647,6 +3646,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa 192(%rdi), %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 @@ -3835,10 +3835,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm7 ; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: por %xmm2, %xmm3 @@ -3942,11 +3942,11 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm7 ; SSE-NEXT: movdqa %xmm12, %xmm5 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm12, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 @@ -4074,10 +4074,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] ; SSE-NEXT: pand %xmm3, %xmm10 ; 
SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: packuswb %xmm5, %xmm0 @@ -4118,10 +4118,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] ; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7] @@ -7220,29 +7220,29 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa 160(%rdi), %xmm6 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pxor %xmm6, %xmm6 ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6] ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE-NEXT: movdqa 112(%rdi), %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm2 @@ -7855,13 +7855,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm9, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand 
%xmm15, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 @@ -9479,12 +9479,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128] ; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm14 ; AVX-NEXT: vpor %xmm5, %xmm14, %xmm5 -; AVX-NEXT: vmovdqa %xmm12, %xmm14 ; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm5, %xmm2 -; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm5 ; AVX-NEXT: vmovdqa %xmm13, %xmm9 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm2 +; AVX-NEXT: vmovdqa %xmm12, %xmm14 +; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vpshufb %xmm13, %xmm13, %xmm5 ; AVX-NEXT: vmovdqa %xmm1, %xmm12 ; AVX-NEXT: vpor %xmm5, %xmm2, %xmm1 ; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2 @@ -9936,8 +9936,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: # xmm9 = mem[0,0] ; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm10 ; AVX-NEXT: vpor %xmm7, %xmm10, %xmm7 -; AVX-NEXT: vmovdqa %xmm0, %xmm11 ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm10 +; AVX-NEXT: vmovdqa %xmm0, %xmm11 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3 @@ -10148,22 +10148,22 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2: # %bb.0: ; AVX2-NEXT: subq $760, %rsp # imm = 0x2F8 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 @@ -10687,22 +10687,22 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $760, %rsp # imm = 0x2F8 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4 -; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0] +; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm13 ; 
AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0 @@ -11225,10 +11225,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-LABEL: load_i8_stride7_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $776, %rsp # imm = 0x308 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 @@ -11241,6 +11240,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0 @@ -13521,7 +13521,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512BW-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4} ; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] @@ -13574,6 +13573,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k3} ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -13894,7 +13894,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 -; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] @@ -13914,6 +13913,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15 @@ -14138,13 +14138,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7 -; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000 +; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 @@ -14254,7 +14254,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4} ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] @@ -14307,6 +14306,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3} ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -14624,7 +14624,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero ; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22 -; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4} ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] @@ -14644,6 +14643,7 @@ define void 
@load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15 @@ -14868,13 +14868,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm3, %zmm3 -; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX512DQ-BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000 +; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index b1eb4d6af4eb7..72be7f0399fd5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -3792,13 +3792,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride8_vf32: ; SSE: # %bb.0: ; SSE-NEXT: subq $904, %rsp # imm = 0x388 -; SSE-NEXT: movdqa 64(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm13 ; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa 176(%rdi), %xmm14 @@ -3812,8 +3810,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm2 @@ -3824,16 +3822,18 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa 96(%rdi), %xmm12 +; SSE-NEXT: movdqa 64(%rdi), %xmm5 ; SSE-NEXT: packuswb %xmm0, 
%xmm1 ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa 112(%rdi), %xmm15 ; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8619,15 +8619,15 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 176(%rdi), %xmm13 ; SSE-NEXT: movdqa 192(%rdi), %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 208(%rdi), %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdi), %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0] ; SSE-NEXT: movdqa %xmm12, %xmm0 ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: movdqa 208(%rdi), %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 @@ -8647,10 +8647,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: movdqa 112(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa 112(%rdi), %xmm14 ; SSE-NEXT: packuswb %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -13908,7 +13908,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm7, %xmm24 ; AVX512-NEXT: vpshufb %xmm9, %xmm8, %xmm2 ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm21 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -13921,6 +13920,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX512-NEXT: vmovdqa %xmm7, %xmm13 ; AVX512-NEXT: vmovdqa 432(%rdi), %xmm11 ; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm2 @@ -14004,8 +14004,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm13 -; AVX512-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: 
vpshufb %xmm4, %xmm13, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm10 ; AVX512-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -14172,6 +14171,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm20 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -14245,7 +14245,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] ; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm9 -; AVX512-NEXT: vmovdqa64 %xmm25, %xmm20 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] @@ -14312,6 +14311,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm11 ; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] @@ -14324,7 +14324,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} ; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm4 ; AVX512-NEXT: vmovdqa64 %xmm13, %xmm30 -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm9 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX512-NEXT: vmovdqa64 %xmm26, %xmm13 @@ -14838,7 +14837,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -14856,8 +14854,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm4 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] @@ 
-14868,9 +14866,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm20 @@ -15224,7 +15223,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm7, %xmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm24 ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm8, %xmm2 ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm21 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -15237,6 +15235,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512DQ-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm13 ; AVX512DQ-NEXT: vmovdqa 432(%rdi), %xmm11 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm2 @@ -15320,8 +15319,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm13 -; AVX512DQ-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm13, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm10 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -15488,6 +15486,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm20 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -15561,7 +15560,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7] ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm14, %xmm9 -; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm20 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm2 
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3] @@ -15628,6 +15626,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm9 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm11 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] @@ -15640,7 +15639,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm30 -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm9 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm13 @@ -16154,7 +16152,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -16172,8 +16169,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 +; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] @@ -16184,9 +16181,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm20 @@ -16446,7 +16444,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 
128(%rdi), %zmm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512BW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa 496(%rdi), %xmm4 @@ -16462,22 +16459,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 ; AVX512BW-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vmovdqa 448(%rdi), %xmm7 ; AVX512BW-NEXT: vpshufb %xmm19, %xmm7, %xmm6 -; AVX512BW-NEXT: vmovdqa64 %xmm7, %xmm30 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-NEXT: vmovdqa64 %xmm7, %xmm30 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX512BW-NEXT: vpmovqb %ymm4, %xmm4 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512BW-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX512BW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-NEXT: vmovaps 368(%rdi), %xmm2 +; AVX512BW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-NEXT: vmovdqa64 352(%rdi), %xmm27 ; AVX512BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] @@ -16852,6 +16850,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 ; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm13 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] @@ -16879,7 +16878,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpsrlq $32, %zmm17, %zmm3 ; AVX512BW-NEXT: vpmovqb %zmm3, %xmm3 @@ -17326,19 +17324,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 +; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 ; 
AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] +; AVX512BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} @@ -17573,7 +17571,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0 ; AVX512DQ-BW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm2 ; AVX512DQ-BW-NEXT: vmovdqa 496(%rdi), %xmm4 @@ -17589,22 +17586,23 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm6, %xmm26 +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %xmm7 ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm7, %xmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 %xmm7, %xmm30 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa64 %xmm7, %xmm30 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX512DQ-BW-NEXT: vpmovqb %ymm4, %xmm4 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; AVX512DQ-BW-NEXT: vmovdqa 368(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovaps 368(%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %xmm27 ; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] @@ -17979,6 +17977,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm20 
; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm13 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] @@ -18006,7 +18005,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm29, %xmm3 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm17, %zmm3 ; AVX512DQ-BW-NEXT: vpmovqb %zmm3, %xmm3 @@ -18453,19 +18451,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 9d1939f66219f..9fd7862fdc368 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -1168,7 +1168,6 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 16(%rsi), %xmm7 ; SSE-NEXT: movdqa 32(%rsi), %xmm8 ; SSE-NEXT: movdqa 48(%rsi), %xmm11 -; SSE-NEXT: movdqa 32(%rdx), %xmm10 ; SSE-NEXT: movdqa 48(%rdx), %xmm12 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] @@ -1184,10 +1183,11 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa 32(%rdx), %xmm10 ; 
SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 @@ -2809,7 +2809,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-LABEL: store_i16_stride3_vf64: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm3 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] @@ -2830,6 +2829,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa 80(%rsi), %xmm5 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] @@ -2953,7 +2953,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-LABEL: store_i16_stride3_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm1 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm3 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9] @@ -2974,6 +2973,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 80(%rsi), %xmm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] +; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] @@ -3107,16 +3107,13 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512-NEXT: vmovdqa64 16(%rsi), %xmm20 -; AVX512-NEXT: vmovdqa64 32(%rsi), %xmm24 ; AVX512-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512-NEXT: vmovdqa64 16(%rdi), %xmm21 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] @@ -3139,8 +3136,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm10 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512-NEXT: vinserti64x4 $1, 
%ymm10, %zmm0, %zmm10 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 ; AVX512-NEXT: vmovdqa 80(%rsi), %xmm13 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] @@ -3155,6 +3152,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX512-NEXT: vmovdqa64 32(%rsi), %xmm24 ; AVX512-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm5 @@ -3165,10 +3163,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 112(%rsi), %xmm12 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] ; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512-NEXT: vmovdqa64 16(%rsi), %xmm20 ; AVX512-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512-NEXT: vmovdqa64 16(%rdi), %xmm21 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] ; AVX512-NEXT: vmovdqa 96(%rdx), %ymm5 @@ -3259,16 +3259,13 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20 -; AVX512-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24 ; AVX512-FCP-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] @@ -3291,8 +3288,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 ; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm13 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] @@ -3307,6 +3304,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 +; AVX512-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm5 @@ -3317,10 +3315,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm12 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 +; AVX512-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20 ; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm12 +; AVX512-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm5 @@ -3411,16 +3411,13 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5 -; AVX512DQ-NEXT: vmovdqa64 16(%rsi), %xmm20 -; AVX512DQ-NEXT: vmovdqa64 32(%rsi), %xmm24 ; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 -; AVX512DQ-NEXT: vmovdqa64 16(%rdi), %xmm21 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] @@ -3443,8 +3440,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm10 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm12 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 ; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm13 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] @@ -3459,6 +3456,7 @@ define void 
@store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512DQ-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm0
+; AVX512DQ-NEXT: vmovdqa64 32(%rsi), %xmm24
 ; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm2
 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm5
@@ -3469,10 +3467,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm12
 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQ-NEXT: vmovdqa64 16(%rsi), %xmm20
 ; AVX512DQ-NEXT: vprold $16, %xmm12, %xmm12
+; AVX512DQ-NEXT: vmovdqa64 16(%rdi), %xmm21
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm5
@@ -3563,16 +3563,13 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3
 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24
 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21
 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9
 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7]
@@ -3595,8 +3592,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm13
 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
@@ -3611,6 +3608,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
 ; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10
 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24
 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm5
@@ -3621,10 +3619,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm12
 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20
 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21
 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7]
 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm5
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index 704c92924abfb..a52d6cc9bd3b7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -3074,20 +3074,18 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-LABEL: store_i16_stride4_vf64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 (%rsi), %xmm19
 ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm0
 ; AVX512-NEXT: vmovdqa 32(%rsi), %xmm11
 ; AVX512-NEXT: vmovdqa 48(%rsi), %xmm6
-; AVX512-NEXT: vmovdqa64 (%rdi), %xmm20
 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12
 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7
 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18
@@ -3163,11 +3161,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2
+; AVX512-NEXT: vmovdqa64 (%rsi), %xmm19
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17
+; AVX512-NEXT: vmovdqa64 (%rdi), %xmm20
 ; AVX512-NEXT: vmovdqa 80(%rcx), %xmm0
 ; AVX512-NEXT: vmovdqa 80(%rdx), %xmm1
 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3228,20 +3228,18 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512-FCP-LABEL: store_i16_stride4_vf64:
 ; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %xmm19
 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm0
 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm11
 ; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %xmm20
 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18
@@ -3317,11 +3315,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %xmm19
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %xmm20
 ; AVX512-FCP-NEXT: vmovdqa 80(%rcx), %xmm0
 ; AVX512-FCP-NEXT: vmovdqa 80(%rdx), %xmm1
 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3382,20 +3382,18 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512DQ-LABEL: store_i16_stride4_vf64:
 ; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm0
 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm11
 ; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm6
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm12
 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7
 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18
@@ -3471,11 +3469,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-NEXT: vmovdqa 80(%rcx), %xmm0
 ; AVX512DQ-NEXT: vmovdqa 80(%rdx), %xmm1
 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -3536,20 +3536,18 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX512DQ-FCP-LABEL: store_i16_stride4_vf64:
 ; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm11
 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
+; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18
@@ -3625,11 +3623,13 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero
 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %xmm19
 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %xmm20
 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rcx), %xmm0
 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdx), %xmm1
 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 7d2f52d3c5830..994c785126d25 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -1157,12 +1157,11 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 define void
@store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i16_stride5_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm15 ; SSE-NEXT: movdqa 16(%rdi), %xmm5 ; SSE-NEXT: movdqa (%rsi), %xmm8 +; SSE-NEXT: movdqa (%rcx), %xmm14 ; SSE-NEXT: movdqa 16(%rsi), %xmm0 ; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa (%rcx), %xmm14 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rcx), %xmm11 ; SSE-NEXT: movdqa 16(%r8), %xmm3 @@ -1179,6 +1178,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] ; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: movdqa (%rdi), %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm9, %xmm13 @@ -2489,10 +2489,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] @@ -2836,12 +2836,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-LABEL: store_i16_stride5_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vmovdqa (%rdx), %xmm6 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 ; AVX2-NEXT: vmovdqa (%r8), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rdx), %xmm6 ; AVX2-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-NEXT: vmovdqa (%rcx), %xmm7 ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm9 @@ -2852,15 +2851,19 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX2-NEXT: vmovdqa (%rdi), %xmm11 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] +; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-NEXT: vpbroadcastq (%r8), %ymm11 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-NEXT: 
vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -2870,15 +2873,12 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpbroadcastq 32(%r8), %ymm5 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX2-NEXT: vpshufb %xmm11, %xmm12, %xmm0 ; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm12 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] @@ -2894,7 +2894,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,2,6,7,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10],ymm0[11],ymm7[12,13],ymm0[14],ymm7[15] -; AVX2-NEXT: vmovdqa (%rsi), %ymm5 ; AVX2-NEXT: vpshufb %xmm11, %xmm10, %xmm10 ; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm11 ; AVX2-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] @@ -2907,6 +2906,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] +; AVX2-NEXT: vmovdqa (%rsi), %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] ; AVX2-NEXT: vpblendvb %ymm13, %ymm14, %ymm9, %ymm9 ; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[3,2,3,3,7,6,7,7] @@ -3566,7 +3566,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 @@ -3623,6 +3622,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm5 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15] +; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] @@ -3880,7 +3880,6 @@ define void 
@store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpbroadcastq 8(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13] ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3 @@ -3937,6 +3936,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15] +; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] @@ -4583,13 +4583,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm11 ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm12 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] ; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm12 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm14 @@ -4978,10 +4978,10 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] +; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index c725dcd972cd5..b2a270adbb359 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -1711,19 +1711,19 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-LABEL: store_i16_stride6_vf16: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $24, %rsp -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps (%r9), %ymm3 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm10 ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FP-NEXT: vmovaps (%r9), %ymm3 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm7 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 +; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm7 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm9 ; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm11 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm5 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] @@ -2037,15 +2037,16 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %ymm16 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] +; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] +; AVX512-FCP-NEXT: vmovdqa64 (%r8), %ymm16 ; AVX512-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm9 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm11 @@ -2059,7 +2060,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] ; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] @@ -2204,15 +2204,16 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %ymm16 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] +; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %ymm16 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm11 @@ -2226,7 +2227,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = 
xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] @@ -2418,11 +2418,11 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa (%rcx), %xmm4 ; SSE-NEXT: movdqa 16(%rcx), %xmm10 ; SSE-NEXT: movdqa (%r8), %xmm8 -; SSE-NEXT: movdqa (%r9), %xmm11 ; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm9 +; SSE-NEXT: movdqa (%r9), %xmm11 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[3,3] @@ -5188,11 +5188,11 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa (%rcx), %xmm6 ; SSE-NEXT: movdqa 16(%rcx), %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm8 ; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: movdqa (%r9), %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] @@ -5269,6 +5269,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa 32(%rdi), %xmm3 ; SSE-NEXT: movdqa 32(%rsi), %xmm6 ; SSE-NEXT: movdqa %xmm3, %xmm7 @@ -5291,7 +5292,6 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: andps %xmm14, %xmm7 ; SSE-NEXT: orps %xmm7, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] @@ -5312,6 +5312,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = 
xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa 48(%rdi), %xmm3 ; SSE-NEXT: movdqa 48(%rsi), %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm8 @@ -5332,7 +5333,6 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: andps %xmm14, %xmm8 ; SSE-NEXT: orps %xmm8, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] @@ -5353,6 +5353,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa 64(%rdi), %xmm3 ; SSE-NEXT: movdqa 64(%rsi), %xmm8 ; SSE-NEXT: movdqa %xmm3, %xmm11 @@ -5373,7 +5374,6 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: andps %xmm14, %xmm11 ; SSE-NEXT: orps %xmm11, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] @@ -5394,6 +5394,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa 80(%rdi), %xmm3 ; SSE-NEXT: movdqa 80(%rsi), %xmm11 ; SSE-NEXT: movdqa %xmm3, %xmm12 @@ -5414,7 +5415,6 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: andps %xmm14, %xmm12 ; SSE-NEXT: orps %xmm12, %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] @@ -6619,17 +6619,17 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa %xmm2, %xmm14 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5] ; AVX2-NEXT: vmovdqa (%rdi), %xmm10 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,2,1] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-NEXT: vmovdqa (%r8), %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; 
AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vmovdqa 32(%r8), %xmm4 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-NEXT: vmovdqa (%r9), %xmm0 @@ -9244,23 +9244,23 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm7 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm8 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm5 +; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index dc362d729fcd3..1d1c4de793b6d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -2288,6 +2288,8 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] +; AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-NEXT: vmovdqa (%rdx), %ymm5 ; AVX2-NEXT: vmovdqa (%rcx), %ymm13 @@ -2295,8 +2297,6 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] -; AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm0 ; AVX2-NEXT: 
vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7] ; AVX2-NEXT: vmovdqa %ymm6, %ymm7 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] @@ -3565,27 +3565,27 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 48(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm1 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rcx), %xmm5 ; SSE-NEXT: movdqa 48(%r8), %xmm9 -; SSE-NEXT: movdqa 48(%r9), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 48(%rax), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] +; SSE-NEXT: movaps 48(%rax), %xmm7 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm11 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%r9), %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 @@ -4798,11 +4798,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa (%rcx), %ymm14 ; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm9 -; AVX2-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] +; AVX2-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-NEXT: vpermd %ymm8, %ymm0, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 @@ -4814,7 +4815,6 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] @@ -7719,32 +7719,31 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rsi), %xmm2 ; SSE-NEXT: 
movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdx), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm1 -; SSE-NEXT: movdqa 96(%rcx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rcx), %xmm6 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa 112(%r8), %xmm4 -; SSE-NEXT: movdqa 112(%r9), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rax), %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: movdqa 96(%rcx), %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] +; SSE-NEXT: movdqa 112(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE-NEXT: movdqa 96(%rdx), %xmm5 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] ; SSE-NEXT: movdqa %xmm4, %xmm9 @@ -7755,7 +7754,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm15[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: andps %xmm1, %xmm4 ; SSE-NEXT: andnps %xmm7, %xmm1 @@ -8135,15 +8134,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 @@ -8847,7 
+8846,6 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[1,1] ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 @@ -8884,6 +8882,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: movaps %xmm10, %xmm13 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 @@ -9023,7 +9022,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: movaps %xmm13, %xmm7 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[1],mem[0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -9647,13 +9646,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovdqa 48(%r9), %xmm1 ; AVX-NEXT: vmovdqa 48(%r8), %xmm2 ; AVX-NEXT: vmovdqa 48(%rax), %xmm11 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX-NEXT: vmovdqa %xmm2, %xmm9 -; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa %xmm1, %xmm10 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm7, %xmm2 -; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5],xmm1[6,7] ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,2],xmm11[1,3] @@ -9790,14 +9788,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vorps %ymm0, %ymm7, %ymm7 ; AVX-NEXT: vmovdqa 80(%r9), %xmm0 ; AVX-NEXT: vmovdqa 80(%r8), %xmm2 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; AVX-NEXT: vmovdqa %xmm0, %xmm4 +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; AVX-NEXT: vmovdqa %xmm2, %xmm3 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovdqa %xmm0, %xmm4 -; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa 80(%rax), %xmm2 -; AVX-NEXT: vmovdqa %xmm6, %xmm0 -; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4],xmm6[5],xmm15[6,7] ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,2],xmm2[1,3] @@ -15369,8 +15366,8 @@ 
define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} +; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] @@ -15595,8 +15592,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} +; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] @@ -15821,8 +15818,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} +; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] @@ -16047,8 +16044,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} +; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll index e333e47219116..22508e2ccfc79 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -1321,14 +1321,14 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; ; AVX2-LABEL: store_i32_stride2_vf64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps 224(%rdi), %ymm0 +; AVX2-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm9 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm8 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 
+; AVX2-NEXT: vmovaps 224(%rdi), %ymm0
 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm6
-; AVX2-NEXT: vmovaps 128(%rdi), %ymm8
-; AVX2-NEXT: vmovaps (%rdi), %ymm1
 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm4
 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm7
-; AVX2-NEXT: vmovaps 96(%rdi), %ymm9
 ; AVX2-NEXT: vmovaps 192(%rsi), %ymm10
 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm11
 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm12
@@ -1393,14 +1393,14 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ;
 ; AVX2-FP-LABEL: store_i32_stride2_vf64:
 ; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0
+; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm9
+; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm8
 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3
+; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0
 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm6
-; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm8
-; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm4
 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm7
-; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm9
 ; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm10
 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm11
 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm12
@@ -1465,14 +1465,14 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v
 ;
 ; AVX2-FCP-LABEL: store_i32_stride2_vf64:
 ; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm9
+; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm8
 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0
 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm6
-; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm8
-; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4
 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm7
-; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm9
 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm10
 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm11
 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm12
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
index de2e1df4c5566..7d636b2d8aa3b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -714,13 +714,13 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: movaps 16(%rsi), %xmm9
 ; SSE-NEXT: movaps 32(%rsi), %xmm10
 ; SSE-NEXT: movaps 48(%rsi), %xmm11
-; SSE-NEXT: movaps 16(%rdx), %xmm0
-; SSE-NEXT: movaps 32(%rdx), %xmm3
 ; SSE-NEXT: movaps 48(%rdx), %xmm8
+; SSE-NEXT: movaps 32(%rdx), %xmm3
 ; SSE-NEXT: movaps %xmm5, %xmm12
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1]
 ; SSE-NEXT: movaps %xmm5, %xmm13
 ; SSE-NEXT: movaps %xmm5, %xmm6
+; SSE-NEXT: movaps 16(%rdx), %xmm0
 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1]
 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm11[3,3]
 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
index 58991d65cf1ee..ede8586545e49 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll
@@ -1334,11 +1334,9 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ;
 ; AVX-LABEL: store_i32_stride5_vf16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rdi), %ymm0
+; AVX-NEXT:
vmovaps (%rsi), %xmm6 ; AVX-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX-NEXT: vmovaps 32(%rsi), %ymm3 -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX-NEXT: vmovaps (%rsi), %xmm6 ; AVX-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX-NEXT: vmovaps (%rdi), %xmm10 ; AVX-NEXT: vmovaps 32(%rdi), %xmm9 @@ -1346,11 +1344,12 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm10[0],xmm6[0],zero,zero ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX-NEXT: vmovaps (%rdx), %xmm11 -; AVX-NEXT: vmovaps (%rcx), %xmm12 ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0] +; AVX-NEXT: vmovaps (%rcx), %xmm12 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] ; AVX-NEXT: vbroadcastss 4(%rdx), %xmm13 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 +; AVX-NEXT: vmovaps (%rdi), %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6],ymm7[7] ; AVX-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6,7] @@ -1368,6 +1367,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX-NEXT: vinsertf128 $1, 32(%r8), %ymm15, %ymm15 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7] +; AVX-NEXT: vmovaps 32(%rdx), %ymm2 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhps {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX-NEXT: vbroadcastss 4(%rcx), %xmm12 @@ -4774,11 +4774,10 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm5 ; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps 32(%rcx), %xmm12 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps 16(%r8), %xmm11 ; SSE-NEXT: movaps 32(%r8), %xmm14 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%r8), %xmm11 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] @@ -4789,8 +4788,9 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] +; SSE-NEXT: movaps 32(%rcx), %xmm12 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5310,11 +5310,11 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[2,0] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps 240(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; 
SSE-NEXT: movaps %xmm5, %xmm6
 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1]
 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0]
 ; SSE-NEXT: movaps %xmm5, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index e4f616ed730eb..31d7791d674a4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -10090,48 +10090,47 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: subq $1160, %rsp # imm = 0x488
 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24
 ; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29
-; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23
-; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm21
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24
+; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
 ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm31
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm25
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0
@@ -10659,48 +10658,47 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: subq $1160, %rsp # imm = 0x488
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24
 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
 ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm31
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0
@@ -11228,48 +11226,47 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT: subq $1160, %rsp # imm = 0x488
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24
 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
 ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0
@@ -11797,48 +11794,47 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: subq $1160, %rsp # imm = 0x488
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
+; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18]
 ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17]
-; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25]
 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29]
 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31]
 ; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 13930bc2c6740..275f36005f1ee 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -1779,24 +1779,24 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE: # %bb.0:
 ; SSE-NEXT: subq $520, %rsp # imm = 0x208
 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa (%rdi), %xmm10
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa (%rsi), %xmm4
-; SSE-NEXT: movdqa 16(%rsi), %xmm6
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps (%rdx), %xmm5
+; SSE-NEXT: movaps (%rcx), %xmm8
+; SSE-NEXT: movaps 16(%r8), %xmm14
 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 16(%rdx), %xmm9
-; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%rcx), %xmm8
+; SSE-NEXT: movaps (%r8), %xmm15
+; SSE-NEXT: movdqa (%r9), %xmm13
 ; SSE-NEXT: movaps 16(%rcx), %xmm2
 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps (%r8), %xmm15
-; SSE-NEXT: movaps 16(%r8), %xmm14
+; SSE-NEXT: movdqa (%rdi), %xmm10
+; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa (%r9), %xmm13
 ; SSE-NEXT: movdqa 16(%r9), %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa (%rax), %xmm11
+; SSE-NEXT: movdqa 16(%rsi), %xmm6
 ; SSE-NEXT: movaps %xmm15, %xmm0
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
@@ -1917,11 +1917,11 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm15[0]
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm4, %xmm13
 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; SSE-NEXT: movdqa %xmm4, %xmm13
+; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0]
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3]
 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
@@ -4015,36 +4015,35 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT: movdqa (%rdi), %xmm8
 ; SSE-NEXT: movdqa (%rsi), %xmm10
-; SSE-NEXT: movdqa 16(%rsi), %xmm4
 ; SSE-NEXT: movaps (%rdx), %xmm14
-; SSE-NEXT: movdqa 16(%rdx), %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps (%rcx), %xmm13
-; SSE-NEXT: movaps 16(%rcx), %xmm9
+; SSE-NEXT: movdqa 16(%rdx), %xmm7
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 16(%rcx), %xmm9
 ; SSE-NEXT: movaps (%r8), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps 16(%r8), %xmm11
-; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa (%r9), %xmm15
-; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 16(%r9), %xmm12
+; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rsi), %xmm4
 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa (%rax), %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1]
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
 ; SSE-NEXT: movaps %xmm14, %xmm3
 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm8, %xmm0
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 16(%r8), %xmm11
 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0]
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4065,9 +4064,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 32(%rsi), %xmm1
-; SSE-NEXT: movaps 32(%rdx), %xmm5
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%rdx), %xmm5
 ; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm5, %xmm1
@@ -4119,9 +4118,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 64(%rsi), %xmm1
-; SSE-NEXT: movaps 64(%rdx), %xmm3
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 64(%rdx), %xmm3
 ; SSE-NEXT: movdqa %xmm1, %xmm12
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm3, %xmm1
@@ -6798,8 +6797,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512-NEXT: kmovw %ecx, %k2
 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm5
@@ -7001,8 +7000,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512-FCP-NEXT: kmovw %ecx, %k2
 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm5
@@ -7204,8 +7203,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-NEXT: kmovw %ecx, %k2
 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm5
@@ -7407,8 +7406,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2
 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm5
@@ -7610,8 +7609,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5
@@ -7813,8 +7812,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5
@@ -8016,8 +8015,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2
 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm5
@@ -8219,8 +8218,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0]
 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
+; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2}
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5
@@ -8381,23 +8380,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE: # %bb.0:
 ; SSE-NEXT: subq $2760, %rsp # imm = 0xAC8
 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa (%rdi), %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa (%rsi), %xmm4
-; SSE-NEXT: movdqa 16(%rsi), %xmm3
-; SSE-NEXT: movaps (%rdx), %xmm2
+; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps (%rdx), %xmm2
+; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 16(%rdx), %xmm7
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rsi), %xmm3
 ; SSE-NEXT: movaps (%rcx), %xmm13
 ; SSE-NEXT: movaps 16(%rcx), %xmm9
+; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rdi), %xmm6
 ; SSE-NEXT: movaps (%r8), %xmm0
-; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps 16(%r8), %xmm10
 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa (%r9), %xmm12
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 16(%r9), %xmm8
 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa (%rax), %xmm15
@@ -8434,9 +8433,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 32(%rsi), %xmm1
-; SSE-NEXT: movaps 32(%rdx), %xmm3
-; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 32(%rdx), %xmm3
 ; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm3, %xmm1
@@ -8487,9 +8486,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 64(%rsi), %xmm1
-; SSE-NEXT: movaps 64(%rdx), %xmm4
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 64(%rdx), %xmm4
 ; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm4, %xmm1
@@ -8540,9 +8539,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 96(%rsi), %xmm1
-; SSE-NEXT: movaps 96(%rdx), %xmm4
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 96(%rdx), %xmm4
 ; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm4, %xmm1
@@ -8592,9 +8591,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 128(%rsi), %xmm1
-; SSE-NEXT: movaps 128(%rdx), %xmm4
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 128(%rdx), %xmm4
 ; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm4, %xmm1
@@ -8645,9 +8644,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 160(%rsi), %xmm1
-; SSE-NEXT: movaps 160(%rdx), %xmm4
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 160(%rdx), %xmm4
 ; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm4, %xmm1
@@ -8698,9 +8697,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 192(%rsi), %xmm1
-; SSE-NEXT: movaps 192(%rdx), %xmm4
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps 192(%rdx), %xmm4
 ; SSE-NEXT: movdqa %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm4, %xmm1
@@ -9646,14 +9645,14 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX: # %bb.0:
 ; AVX-NEXT: subq $3432, %rsp # imm = 0xD68
 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovaps 224(%rdi), %ymm1
+; AVX-NEXT: vmovaps 224(%rdx), %ymm0
 ; AVX-NEXT: vmovaps 224(%rsi), %ymm2
+; AVX-NEXT: vmovaps 224(%r8), %ymm4
+; AVX-NEXT: vmovaps 224(%rdi), %ymm1
 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 224(%rdx), %ymm0
 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT: vmovaps 224(%rcx), %ymm5
 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 224(%r8), %ymm4
 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT: vmovaps 224(%rax), %ymm3
 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
index 265f6daeb2003..bac4ff8ce434d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
@@ -864,7 +864,6 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT: movaps (%r10), %xmm14
 ; SSE-NEXT: movaps 16(%r10), %xmm12
 ; SSE-NEXT: movaps (%rax), %xmm4
-; SSE-NEXT: movaps 16(%rax), %xmm7
 ; SSE-NEXT: movaps %xmm4, %xmm2
 ; SSE-NEXT: movaps %xmm4, %xmm11
 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0]
@@ -872,10 +871,11 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm3, %xmm2
 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm8, %xmm4
+; SSE-NEXT: movaps 16(%rax), %xmm7
 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
index a01d4de0027f4..75b76f891d46c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
@@ -843,12 +843,12 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: subq $24, %rsp
 ; SSE-NEXT: movapd 64(%rdi), %xmm5
 ; SSE-NEXT: movapd (%rdi), %xmm1
-; SSE-NEXT: movapd 16(%rdi), %xmm2
-; SSE-NEXT: movapd 32(%rdi), %xmm3
-; SSE-NEXT: movapd 48(%rdi), %xmm6
-; SSE-NEXT: movapd 64(%rsi), %xmm9
 ; SSE-NEXT: movapd (%rsi), %xmm4
 ; SSE-NEXT: movapd 16(%rsi), %xmm7
+; SSE-NEXT: movapd 64(%rsi), %xmm9
+; SSE-NEXT: movapd 48(%rdi), %xmm6
+; SSE-NEXT: movapd 32(%rdi), %xmm3
+; SSE-NEXT: movapd 16(%rdi), %xmm2
 ; SSE-NEXT: movapd 32(%rsi), %xmm11
 ; SSE-NEXT: movapd 48(%rsi), %xmm10
 ; SSE-NEXT: movapd 64(%rdx), %xmm15
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index ded7c002c8735..d610029880f81 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -6745,15 +6745,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vmovdqa64 320(%rdx), %zmm7
 ; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm14
-; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm11
 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm3
 ; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm12
 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13
@@ -7040,15 +7040,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdx), %zmm7
 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
 ; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
@@ -7335,15 +7335,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vmovdqa64 320(%rdx), %zmm7
 ; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm14
-; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm11
 ; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm3
 ; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm12
 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13
@@ -7630,15 +7630,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdx), %zmm7
 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
@@ -7925,15 +7925,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 320(%rdx), %zmm7
 ; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm14
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm11
 ; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm3
 ; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12
 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13
@@ -8220,15 +8220,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm7
 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
@@ -8515,15 +8515,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdx), %zmm7
 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm14
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm11
 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm3
 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm12
 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13
@@ -8810,15 +8810,15 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdx), %zmm7
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index ffdbdea024ea0..78a8042b3535e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -7394,20 +7394,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX-LABEL: store_i64_stride5_vf64:
 ; AVX: # %bb.0:
 ; AVX-NEXT: subq $2264, %rsp # imm = 0x8D8
-; AVX-NEXT: vmovaps 192(%rdi), %ymm14
-; AVX-NEXT: vmovaps 160(%rdi), %ymm4
 ; AVX-NEXT: vmovaps 96(%rdi), %ymm5
+; AVX-NEXT: vmovaps 160(%rdi), %ymm4
 ; AVX-NEXT: vmovaps 64(%rcx), %ymm1
-; AVX-NEXT: vmovaps 128(%rcx), %ymm0
 ; AVX-NEXT: vmovaps (%rcx), %ymm2
+; AVX-NEXT: vmovaps 128(%rcx), %ymm0
 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3]
 ; AVX-NEXT: vmovaps 16(%rdx), %xmm6
 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
+; AVX-NEXT: vmovaps 80(%rdx), %xmm3
 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 192(%rdi), %ymm14
 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3]
-; AVX-NEXT: vmovaps 80(%rdx), %xmm3
 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index 89642492f83a8..651f851f9f6f9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -4443,7 +4443,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm28
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm28
-; AVX512DQ-FCP-NEXT: movb $112, %sil
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
 ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
@@ -4458,6 +4457,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm19
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT: movb $112, %sil
 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
@@ -4468,10 +4468,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm29
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm24
-; AVX512DQ-FCP-NEXT: movb $96, %sil
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm27
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm21
+; AVX512DQ-FCP-NEXT: movb $96, %sil
 ; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
 ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3}
@@ -4481,7 +4481,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm22
-; AVX512DQ-FCP-NEXT: movb $120, %sil
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm30
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm30
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm26
@@ -4501,6 +4500,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm12, %zmm20
+; AVX512DQ-FCP-NEXT: movb $120, %sil
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm20, %zmm30
@@ -5267,7 +5267,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm28
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm28
-; AVX512DQ-BW-FCP-NEXT: movb $112, %sil
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
 ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
@@ -5282,6 +5281,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm19
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0
+; AVX512DQ-BW-FCP-NEXT: movb $112, %sil
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
@@ -5292,10 +5292,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm29
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm24
-; AVX512DQ-BW-FCP-NEXT: movb $96, %sil
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm27
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm21
+; AVX512DQ-BW-FCP-NEXT: movb $96, %sil
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
 ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3}
@@ -5305,7 +5305,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm22
-; AVX512DQ-BW-FCP-NEXT: movb $120, %sil
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm30
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm26
@@ -5325,6 +5324,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm12, %zmm20
+; AVX512DQ-BW-FCP-NEXT: movb $120, %sil
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm20, %zmm30
@@ -16932,58 +16932,58 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19
-; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm9
+; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm17
 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm17
 ; AVX512-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 (%rdx), %zmm3
-; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 (%r8), %zmm1
+; AVX512-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm18
 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm18
 ; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 (%r9), %zmm15
+; AVX512-NEXT: kmovw %r10d, %k1
+; AVX512-NEXT: movb $96, %r10b
 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
 ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vmovdqa64 (%rax), %zmm5
+; AVX512-NEXT: vmovdqa64 (%rdx), %zmm3
 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
 ; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512-NEXT: movb $96, %r10b
-; AVX512-NEXT: kmovw %r10d, %k1
-; AVX512-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512-NEXT: vmovdqa64 (%r9), %zmm15
-; AVX512-NEXT: vmovdqa64 (%rax), %zmm5
 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm6
 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
 ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
 ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0
 ; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm0
 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm2
 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512-NEXT: vmovdqa (%r9), %ymm7
 ; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm2
+; AVX512-NEXT: vmovdqa (%r9), %ymm7
 ; AVX512-NEXT: vmovdqa 64(%r9), %ymm3
 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT: vmovdqa (%r8), %ymm0
 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT: vmovdqa 64(%r8), %ymm4
-; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512-NEXT: movb $28, %r10b
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT: kmovw %r10d, %k2
+; AVX512-NEXT: movb $28, %r10b
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19
 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
@@ -17894,48 +17894,44 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
-; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm8
-; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20
+; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14
-; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm10
-; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1
+; AVX512-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm11
-; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19
-; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
 ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm18
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
 ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: movb $96, %r10b
-; AVX512-FCP-NEXT: kmovw %r10d, %k1
-; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm18
 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm10
+; AVX512-FCP-NEXT: kmovw %r10d, %k1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm0
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1}
 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0
 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm10
@@ -17947,7 +17943,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
 ; AVX512-FCP-NEXT: movb $28, %r10b
 ; AVX512-FCP-NEXT: kmovw %r10d, %k2
+; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
@@ -18854,36 +18853,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm16
-; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm9
+; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm13
-; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5
-; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1
+; AVX512DQ-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm15
 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6
 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm15
-; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm14
 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
 ; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT: movb $96, %r10b
+; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: kmovw %r10d, %k1
-; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm14
-; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm3
 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm4
+; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5
+; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm3
+; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
@@ -18892,18 +18890,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm2
 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm10, %zmm0
 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2
-; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm2
-; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm7
 ; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0
 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm6
 ; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm16
 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQ-NEXT: movb $28, %r10b
 ; AVX512DQ-NEXT: kmovw %r10d, %k2
@@ -19808,37 +19807,36 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1
+; AVX512DQ-FCP-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm13
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: movb $96, %r10b
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: movb $96, %r10b
 ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm3
 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
+; AVX512DQ-FCP-NEXT: vmovaps (%rdx), %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm22
 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
@@ -19847,18 +19845,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm7
 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm13
 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm13
 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0
 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm6
 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12
 ; AVX512DQ-FCP-NEXT: movb $28, %r10b
 ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2
 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
@@ -20761,58 +20760,58 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19
-; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9
+; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17
 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512BW-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17
 ; AVX512BW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3
-; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1
+; AVX512BW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm18
 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4
 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm18
 ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm15
+; AVX512BW-NEXT: kmovd %r10d, %k1
+; AVX512BW-NEXT: movb $96, %r10b
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm5
+; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
 ; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: movb $96, %r10b
-; AVX512BW-NEXT: kmovd %r10d, %k1
-; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm15
-; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm5
 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm6
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
 ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0
 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa (%r9), %ymm7
 ; AVX512BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm2
+; AVX512BW-NEXT: vmovdqa (%r9), %ymm7
 ; AVX512BW-NEXT: vmovdqa 64(%r9), %ymm3
 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm0
 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT: vmovdqa 64(%r8), %ymm4
-; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512BW-NEXT: movb $28, %r10b
+; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT: kmovd %r10d, %k2
+; AVX512BW-NEXT: movb $28, %r10b
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
@@ -21723,48 +21722,44 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm10
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1
+; AVX512BW-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm18
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: movb $96, %r10b
-; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm18
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm10
+; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa 64(%r9), %ymm10 @@ -21776,7 +21771,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2] ; AVX512BW-FCP-NEXT: movb $28, %r10b ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5] ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] @@ -22683,36 +22681,35 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm14 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3] ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: movb $96, %r10b +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-NEXT: # zmm25 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] @@ -22721,18 +22718,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2 -; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm7 ; AVX512DQ-BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm5 ; AVX512DQ-BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 64(%r8), %ymm6 ; AVX512DQ-BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm16 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] ; AVX512DQ-BW-NEXT: movb $28, %r10b ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 @@ -23637,37 +23635,36 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: movb $96, %r10b ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: movb $96, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdx), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] @@ -23676,18 +23673,19 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r8), %ymm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: movb $28, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index e837f14d367b2..5c005567db232 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -18632,16 +18632,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512-NEXT: 
vmovdqa64 128(%r9), %zmm22 +; AVX512-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512-NEXT: movb $-64, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] @@ -18676,18 +18674,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -19579,16 +19579,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512-FCP-NEXT: movb $-64, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k1 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] @@ -19623,18 +19621,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512-FCP-NEXT: 
vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -20526,16 +20526,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-NEXT: movb $-64, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] @@ -20570,18 +20568,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -21473,16 +21473,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm26 ; 
AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] @@ -21517,18 +21515,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -22420,16 +22420,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512BW-NEXT: movb $-64, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] @@ -22464,18 +22462,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm16 ; 
AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -23367,16 +23367,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512BW-FCP-NEXT: movb $-64, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] @@ -23411,18 +23409,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -24314,16 +24314,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm28 
; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-BW-NEXT: movb $-64, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] @@ -24358,18 +24356,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -25261,16 +25261,14 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [4,12,4,12,4,12,4,12] @@ -25305,18 +25303,20 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index 3d26171054f2e..c1e99368e9201 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -1645,7 +1645,6 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa (%rdx), %xmm3 -; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 ; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm0 @@ -1660,6 +1659,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm15 ; AVX-NEXT: vpor %xmm2, %xmm15, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 ; AVX-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm15 ; AVX-NEXT: vpor %xmm6, %xmm15, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 06d390f053c7e..5e87572af5dc1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -981,7 +981,6 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa (%rsi), %xmm8 ; SSE-NEXT: movdqa (%rdx), %xmm2 ; SSE-NEXT: movdqa (%rcx), %xmm4 -; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] @@ -990,13 +989,14 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: pshufd 
{{.*#+}} xmm7 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm12 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] @@ -1661,7 +1661,6 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdx), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rcx), %xmm11 ; SSE-NEXT: movdqa 16(%rcx), %xmm12 ; SSE-NEXT: movdqa 16(%r8), %xmm14 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] @@ -1680,8 +1679,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] ; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] @@ -1691,6 +1690,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 +; SSE-NEXT: movdqa (%rcx), %xmm11 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm10, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 60af864597f4f..fbeecbc0a4ab2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -1767,10 +1767,10 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE-NEXT: pand %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm6, %xmm1 @@ -2231,11 +2231,10 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-LABEL: store_i8_stride6_vf32: ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vmovdqa (%rdi), %ymm1 -; AVX2-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa (%r8), %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] @@ -2244,6 +2243,7 @@ define void @store_i8_stride6_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqa (%rdx), %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm9 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-NEXT: vmovdqa (%rsi), %xmm11 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] @@ -2365,15 +2365,15 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-LABEL: store_i8_stride6_vf32: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $40, %rsp -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm7 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2501,15 +2501,15 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-LABEL: store_i8_stride6_vf32: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $40, %rsp -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm4 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1 +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm7 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5451,10 +5451,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm4 ; AVX512-NEXT: vmovdqa64 %xmm10, %xmm22 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5836,9 +5836,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpermt2q 
%zmm0, %zmm4, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512-FCP-NEXT: vpternlogq $226, %zmm7, %zmm9, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] @@ -5980,10 +5980,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm1 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm22 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6365,9 +6365,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm7, %zmm9, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 416fbe9aa340c..14e5f65407942 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -2683,16 +2683,16 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa 16(%rdi), %xmm1 ; SSE-NEXT: movdqa 16(%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rdx), %xmm3 -; SSE-NEXT: movdqa 16(%rcx), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r8), %xmm6 -; SSE-NEXT: movdqa 16(%r9), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa 16(%rcx), 
%xmm7 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: movdqa %xmm4, %xmm8 ; SSE-NEXT: movdqa %xmm4, %xmm13 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3129,10 +3129,10 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,1,3] +; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] ; SSE-NEXT: pand %xmm2, %xmm3 @@ -4304,7 +4304,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 (%r10), %ymm17 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 @@ -4341,6 +4340,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] ; AVX512-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 (%r10), %ymm17 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero @@ -4583,7 +4583,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %ymm17 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u],zero,xmm8[7],zero,xmm8[5,u,u,u],zero,xmm8[8],zero,xmm8[6,u,u,u],zero ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 @@ -4620,6 +4619,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] ; AVX512DQ-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %ymm17 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero @@ -5328,9 +5328,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
SSE-NEXT: movdqa 48(%rax), %xmm13 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -5928,12 +5928,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm10, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm0, %xmm10 +; SSE-NEXT: pand %xmm6, %xmm10 ; SSE-NEXT: por %xmm10, %xmm2 ; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm15, %xmm2 @@ -5942,9 +5941,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: pandn %xmm10, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7] @@ -6311,7 +6310,6 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm12, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 ; SSE-NEXT: pand %xmm12, %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm4 ; SSE-NEXT: por %xmm6, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 @@ -6323,13 +6321,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,0,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm11, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm3, %xmm10 @@ -6378,17 +6375,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; SSE-NEXT: pand %xmm7, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm12, %xmm14 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255] ; SSE-NEXT: movdqa %xmm12, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,4] -; SSE-NEXT: movdqa %xmm4, %xmm14 -; SSE-NEXT: movdqa %xmm4, %xmm0 +; 
SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm10, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: pand %xmm14, %xmm10 ; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: movdqa %xmm12, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 311166ef60dda..39b012bcf8d4e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -3024,7 +3024,6 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovaps 16(%rsi), %xmm8 ; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm9[1],ymm15[2],ymm9[3],ymm15[4,5,6,7,8],ymm9[9],ymm15[10],ymm9[11],ymm15[12,13,14,15] -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,2,1,4,4,6,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2],ymm15[3],ymm0[4],ymm15[5],ymm0[6],ymm15[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3036,13 +3035,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa 16(%rcx), %xmm8 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,5,7] +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7] ; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 ; AVX2-NEXT: vmovdqa 16(%rdx), %xmm15 @@ -3052,12 +3051,13 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX2-NEXT: vmovdqa 16(%rcx), %xmm8 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw 
{{.*#+}} xmm5 = xmm1[0,1,2,3,4,6,6,7] diff --git a/llvm/test/CodeGen/X86/vector-intrinsics.ll b/llvm/test/CodeGen/X86/vector-intrinsics.ll index ea4a339d9d5e8..6441a83a4e326 100644 --- a/llvm/test/CodeGen/X86/vector-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-intrinsics.ll @@ -238,14 +238,14 @@ define void @b(ptr %p, ptr %q) nounwind { ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps 32(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps 48(%rdi), %xmm2 -; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movaps (%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps 48(%rdi), %xmm2 ; CHECK-NEXT: movaps 32(%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 48(%rsi), %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index dc9e69137a8a7..c58c4614022c6 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -3320,27 +3320,24 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; SSE2: # %bb.0: ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: sext_4i1_to_4i64: ; SSSE3: # %bb.0: ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_4i1_to_4i64: ; SSE41: # %bb.0: ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; SSE41-NEXT: movdqa %xmm2, %xmm0 +; SSE41-NEXT: pmovsxdq %xmm0, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: sext_4i1_to_4i64: @@ -3370,18 +3367,16 @@ define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pslld $31, %xmm0 ; X86-SSE2-NEXT: psrad $31, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; X86-SSE2-NEXT: retl ; ; X86-SSE41-LABEL: sext_4i1_to_4i64: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: pslld $31, %xmm0 ; X86-SSE41-NEXT: psrad $31, %xmm0 -; X86-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE41-NEXT: pmovsxdq %xmm0, %xmm0 ; X86-SSE41-NEXT: retl %extmask = sext <4 x i1> %mask to <4 x i64> ret <4 x i64> %extmask diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 81ce14132c879..d6171235aa2c4 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -685,11 +685,11 @@ define <16 x i64> @bit_reversal_permutation(<16 x 
i64> %a0) nounwind { ; X86-AVX1-NEXT: vmovaps 8(%ebp), %ymm5 ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm5[2,3] ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] -; X86-AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] -; X86-AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; X86-AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; X86-AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; X86-AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] ; X86-AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; X86-AVX1-NEXT: vmovaps %ymm4, %ymm1 ; X86-AVX1-NEXT: movl %ebp, %esp @@ -714,10 +714,10 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X86-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 ; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm6[0] ; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] ; X86-AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; X86-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; X86-AVX2-NEXT: vmovaps %ymm5, %ymm0 ; X86-AVX2-NEXT: vmovaps %ymm4, %ymm1 @@ -739,11 +739,11 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3] ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] -; X64-AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; X64-AVX1-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; X64-AVX1-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; X64-AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; X64-AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; X64-AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] ; X64-AVX1-NEXT: vmovaps %ymm4, %ymm1 ; X64-AVX1-NEXT: vmovaps %ymm5, %ymm3 @@ -754,22 +754,21 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] ; X64-AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] -; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3] ; X64-AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] +; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3] ; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm3[0] -; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; X64-AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; 
X64-AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; X64-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; X64-AVX2-NEXT: vmovaps %ymm6, %ymm0 ; X64-AVX2-NEXT: vmovaps %ymm4, %ymm1 -; X64-AVX2-NEXT: vmovaps %ymm5, %ymm3 ; X64-AVX2-NEXT: retq %v0 = shufflevector <16 x i64> %a0, <16 x i64> undef, <16 x i32> %v1 = shufflevector <16 x i64> %v0, <16 x i64> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index 74926f46ffa43..90b6beeae516d 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -127,10 +127,10 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) { ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq @@ -584,10 +584,10 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq @@ -881,10 +881,10 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vmovaps %ymm2, %ymm0 ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index f84131dfc8797..35b90a4b2205f 100644 --- 
a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -3220,8 +3220,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 49947eddc61b9..bf330de825966 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1275,17 +1275,17 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-LABEL: interleaved_load_vf64_i8_stride3: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqu (%rdi), %xmm11 -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13 ; AVX1-NEXT: vmovups 64(%rdi), %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13 ; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5 -; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2 +; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 ; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10 +; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2 +; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqu 160(%rdi), %xmm3 -; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 ; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6 ; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7 ; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8 diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index 2169b39b9dfa0..150385ffd8aa8 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -63,9 +63,8 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, ptr %res) { ; SDAG-NEXT: movl %edi, %eax ; SDAG-NEXT: # kill: def $al killed $al killed $eax ; SDAG-NEXT: imulb %sil -; SDAG-NEXT: seto %cl ; SDAG-NEXT: movb %al, (%rdx) -; SDAG-NEXT: movl %ecx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: smuloi8: @@ -83,9 +82,8 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %ecx, %eax ; WIN64-NEXT: imulb %dl -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movb %al, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloi8: @@ -93,9 +91,8 @@ define zeroext i1 @smuloi8(i8 %v1, i8 %v2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: imulb {{[0-9]+}}(%esp) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movb %al, (%edx) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: retl %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2) %val = extractvalue {i8, i1} %t, 0 @@ -211,14 +208,14 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: subl $8, %esp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl 
{{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: addl %eax, %ecx @@ -290,9 +287,8 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, ptr %res) { ; SDAG-NEXT: movl %edi, %eax ; SDAG-NEXT: # kill: def $al killed $al killed $eax ; SDAG-NEXT: mulb %sil -; SDAG-NEXT: seto %cl ; SDAG-NEXT: movb %al, (%rdx) -; SDAG-NEXT: movl %ecx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi8: @@ -310,9 +306,8 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %ecx, %eax ; WIN64-NEXT: mulb %dl -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movb %al, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi8: @@ -320,9 +315,8 @@ define zeroext i1 @umuloi8(i8 %v1, i8 %v2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mulb {{[0-9]+}}(%esp) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movb %al, (%edx) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: retl %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2) %val = extractvalue {i8, i1} %t, 0 @@ -338,9 +332,8 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, ptr %res) { ; SDAG-NEXT: movl %edi, %eax ; SDAG-NEXT: # kill: def $ax killed $ax killed $eax ; SDAG-NEXT: mulw %si -; SDAG-NEXT: seto %dl ; SDAG-NEXT: movw %ax, (%rcx) -; SDAG-NEXT: movl %edx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi16: @@ -359,9 +352,8 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %ecx, %eax ; WIN64-NEXT: mulw %dx -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movw %ax, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi16: @@ -370,9 +362,8 @@ define zeroext i1 @umuloi16(i16 %v1, i16 %v2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mulw {{[0-9]+}}(%esp) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movw %ax, (%esi) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: popl %esi ; WIN32-NEXT: retl %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2) @@ -388,9 +379,8 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, ptr %res) { ; SDAG-NEXT: movq %rdx, %rcx ; SDAG-NEXT: movl %edi, %eax ; SDAG-NEXT: mull %esi -; SDAG-NEXT: seto %dl ; SDAG-NEXT: movl %eax, (%rcx) -; SDAG-NEXT: movl %edx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi32: @@ -408,9 +398,8 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %ecx, %eax ; WIN64-NEXT: mull %edx -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movl %eax, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi32: @@ -419,9 +408,8 @@ define zeroext i1 @umuloi32(i32 %v1, i32 %v2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movl %eax, (%esi) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: popl %esi ; WIN32-NEXT: retl %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) @@ -437,9 +425,8 @@ define zeroext i1 
@umuloi64(i64 %v1, i64 %v2, ptr %res) { ; SDAG-NEXT: movq %rdx, %rcx ; SDAG-NEXT: movq %rdi, %rax ; SDAG-NEXT: mulq %rsi -; SDAG-NEXT: seto %dl ; SDAG-NEXT: movq %rax, (%rcx) -; SDAG-NEXT: movl %edx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi64: @@ -457,9 +444,8 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movq %rcx, %rax ; WIN64-NEXT: mulq %rdx -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movq %rax, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi64: @@ -985,14 +971,14 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: pushl %esi ; WIN32-NEXT: pushl %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax -; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movl %edx, %ecx +; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi ; WIN32-NEXT: movl %edx, %ecx ; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: addl %eax, %ecx @@ -1399,9 +1385,8 @@ define zeroext i1 @smuloi8_load(ptr %ptr1, i8 %v2, ptr %res) { ; SDAG-NEXT: movl %esi, %eax ; SDAG-NEXT: # kill: def $al killed $al killed $eax ; SDAG-NEXT: imulb (%rdi) -; SDAG-NEXT: seto %cl ; SDAG-NEXT: movb %al, (%rdx) -; SDAG-NEXT: movl %ecx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: smuloi8_load: @@ -1418,9 +1403,8 @@ define zeroext i1 @smuloi8_load(ptr %ptr1, i8 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %edx, %eax ; WIN64-NEXT: imulb (%rcx) -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movb %al, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloi8_load: @@ -1429,9 +1413,8 @@ define zeroext i1 @smuloi8_load(ptr %ptr1, i8 %v2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movzbl (%eax), %eax ; WIN32-NEXT: imulb {{[0-9]+}}(%esp) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movb %al, (%edx) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: retl %v1 = load i8, ptr %ptr1 %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2) @@ -1447,9 +1430,8 @@ define zeroext i1 @smuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) { ; SDAG-NEXT: movl %edi, %eax ; SDAG-NEXT: # kill: def $al killed $al killed $eax ; SDAG-NEXT: imulb (%rsi) -; SDAG-NEXT: seto %cl ; SDAG-NEXT: movb %al, (%rdx) -; SDAG-NEXT: movl %ecx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: smuloi8_load2: @@ -1467,9 +1449,8 @@ define zeroext i1 @smuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %ecx, %eax ; WIN64-NEXT: imulb (%rdx) -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movb %al, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: smuloi8_load2: @@ -1478,9 +1459,8 @@ define zeroext i1 @smuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: imulb (%ecx) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movb %al, (%edx) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: retl %v2 = load i8, ptr %ptr2 %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2) @@ -1869,9 +1849,8 @@ define zeroext i1 @umuloi8_load(ptr %ptr1, i8 %v2, ptr %res) { ; SDAG-NEXT: movl %esi, %eax ; SDAG-NEXT: # kill: def $al killed $al killed $eax ; SDAG-NEXT: mulb (%rdi) -; SDAG-NEXT: seto %cl ; 
SDAG-NEXT: movb %al, (%rdx) -; SDAG-NEXT: movl %ecx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi8_load: @@ -1888,9 +1867,8 @@ define zeroext i1 @umuloi8_load(ptr %ptr1, i8 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %edx, %eax ; WIN64-NEXT: mulb (%rcx) -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movb %al, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi8_load: @@ -1899,9 +1877,8 @@ define zeroext i1 @umuloi8_load(ptr %ptr1, i8 %v2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movzbl (%eax), %eax ; WIN32-NEXT: mulb {{[0-9]+}}(%esp) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movb %al, (%edx) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: retl %v1 = load i8, ptr %ptr1 %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2) @@ -1917,9 +1894,8 @@ define zeroext i1 @umuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) { ; SDAG-NEXT: movl %edi, %eax ; SDAG-NEXT: # kill: def $al killed $al killed $eax ; SDAG-NEXT: mulb (%rsi) -; SDAG-NEXT: seto %cl ; SDAG-NEXT: movb %al, (%rdx) -; SDAG-NEXT: movl %ecx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi8_load2: @@ -1937,9 +1913,8 @@ define zeroext i1 @umuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %ecx, %eax ; WIN64-NEXT: mulb (%rdx) -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movb %al, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi8_load2: @@ -1948,9 +1923,8 @@ define zeroext i1 @umuloi8_load2(i8 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mulb (%ecx) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movb %al, (%edx) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: retl %v2 = load i8, ptr %ptr2 %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2) @@ -1967,9 +1941,8 @@ define zeroext i1 @umuloi16_load(ptr %ptr1, i16 %v2, ptr %res) { ; SDAG-NEXT: movl %esi, %eax ; SDAG-NEXT: # kill: def $ax killed $ax killed $eax ; SDAG-NEXT: mulw (%rdi) -; SDAG-NEXT: seto %dl ; SDAG-NEXT: movw %ax, (%rcx) -; SDAG-NEXT: movl %edx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi16_load: @@ -1987,9 +1960,8 @@ define zeroext i1 @umuloi16_load(ptr %ptr1, i16 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %edx, %eax ; WIN64-NEXT: mulw (%rcx) -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movw %ax, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi16_load: @@ -1999,9 +1971,8 @@ define zeroext i1 @umuloi16_load(ptr %ptr1, i16 %v2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movzwl (%eax), %eax ; WIN32-NEXT: mulw {{[0-9]+}}(%esp) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movw %ax, (%esi) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: popl %esi ; WIN32-NEXT: retl %v1 = load i16, ptr %ptr1 @@ -2019,9 +1990,8 @@ define zeroext i1 @umuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) { ; SDAG-NEXT: movl %edi, %eax ; SDAG-NEXT: # kill: def $ax killed $ax killed $eax ; SDAG-NEXT: mulw (%rsi) -; SDAG-NEXT: seto %dl ; SDAG-NEXT: movw %ax, (%rcx) -; SDAG-NEXT: movl %edx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi16_load2: @@ -2040,9 +2010,8 @@ define zeroext i1 @umuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %ecx, %eax ; WIN64-NEXT: mulw (%rdx) -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movw %ax, 
(%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi16_load2: @@ -2052,9 +2021,8 @@ define zeroext i1 @umuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mulw (%ecx) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movw %ax, (%esi) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: popl %esi ; WIN32-NEXT: retl %v2 = load i16, ptr %ptr2 @@ -2071,9 +2039,8 @@ define zeroext i1 @umuloi32_load(ptr %ptr1, i32 %v2, ptr %res) { ; SDAG-NEXT: movq %rdx, %rcx ; SDAG-NEXT: movl %esi, %eax ; SDAG-NEXT: mull (%rdi) -; SDAG-NEXT: seto %dl ; SDAG-NEXT: movl %eax, (%rcx) -; SDAG-NEXT: movl %edx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi32_load: @@ -2091,9 +2058,8 @@ define zeroext i1 @umuloi32_load(ptr %ptr1, i32 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %edx, %eax ; WIN64-NEXT: mull (%rcx) -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movl %eax, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi32_load: @@ -2103,9 +2069,8 @@ define zeroext i1 @umuloi32_load(ptr %ptr1, i32 %v2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl (%eax), %eax ; WIN32-NEXT: mull {{[0-9]+}}(%esp) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movl %eax, (%esi) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: popl %esi ; WIN32-NEXT: retl %v1 = load i32, ptr %ptr1 @@ -2122,9 +2087,8 @@ define zeroext i1 @umuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) { ; SDAG-NEXT: movq %rdx, %rcx ; SDAG-NEXT: movl %edi, %eax ; SDAG-NEXT: mull (%rsi) -; SDAG-NEXT: seto %dl ; SDAG-NEXT: movl %eax, (%rcx) -; SDAG-NEXT: movl %edx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi32_load2: @@ -2142,9 +2106,8 @@ define zeroext i1 @umuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movl %ecx, %eax ; WIN64-NEXT: mull (%rdx) -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movl %eax, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi32_load2: @@ -2154,9 +2117,8 @@ define zeroext i1 @umuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN32-NEXT: mull (%ecx) -; WIN32-NEXT: seto %cl ; WIN32-NEXT: movl %eax, (%esi) -; WIN32-NEXT: movl %ecx, %eax +; WIN32-NEXT: seto %al ; WIN32-NEXT: popl %esi ; WIN32-NEXT: retl %v2 = load i32, ptr %ptr2 @@ -2173,9 +2135,8 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; SDAG-NEXT: movq %rdx, %rcx ; SDAG-NEXT: movq %rsi, %rax ; SDAG-NEXT: mulq (%rdi) -; SDAG-NEXT: seto %dl ; SDAG-NEXT: movq %rax, (%rcx) -; SDAG-NEXT: movl %edx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi64_load: @@ -2193,9 +2154,8 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) { ; WIN64: # %bb.0: ; WIN64-NEXT: movq %rdx, %rax ; WIN64-NEXT: mulq (%rcx) -; WIN64-NEXT: seto %cl ; WIN64-NEXT: movq %rax, (%r8) -; WIN64-NEXT: movl %ecx, %eax +; WIN64-NEXT: seto %al ; WIN64-NEXT: retq ; ; WIN32-LABEL: umuloi64_load: @@ -2250,9 +2210,8 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) { ; SDAG-NEXT: movq %rdx, %rcx ; SDAG-NEXT: movq %rdi, %rax ; SDAG-NEXT: mulq (%rsi) -; SDAG-NEXT: seto %dl ; SDAG-NEXT: movq %rax, (%rcx) -; SDAG-NEXT: movl %edx, %eax +; SDAG-NEXT: seto %al ; SDAG-NEXT: retq ; ; FAST-LABEL: umuloi64_load2: @@ -2270,9 +2229,8 @@ define 
zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
 ; WIN64: # %bb.0:
 ; WIN64-NEXT: movq %rcx, %rax
 ; WIN64-NEXT: mulq (%rdx)
-; WIN64-NEXT: seto %cl
 ; WIN64-NEXT: movq %rax, (%r8)
-; WIN64-NEXT: movl %ecx, %eax
+; WIN64-NEXT: seto %al
 ; WIN64-NEXT: retq
 ;
 ; WIN32-LABEL: umuloi64_load2:

From ccd00cf0935ae4d469737557d1b79d4484595842 Mon Sep 17 00:00:00 2001
From: Gabor Spaits
Date: Wed, 21 Aug 2024 22:02:17 +0200
Subject: [PATCH 02/15] Only move instructions if a potential copy would be possible

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp   |   19 +-
 llvm/test/CodeGen/RISCV/llvm.frexp.ll         |   12 +-
 .../test/CodeGen/RISCV/overflow-intrinsics.ll |   10 +-
 llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll       |    3 +-
 .../CodeGen/RISCV/rv64-legal-i32/xaluo.ll     | 2603 -----------------
 .../rvv/fixed-vectors-deinterleave-load.ll    |    3 +-
 .../rvv/fixed-vectors-reduction-int-vp.ll     |   24 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |    3 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |    2 +-
 llvm/test/CodeGen/RISCV/xtheadmemidx.ll       |    3 +-
 10 files changed, 40 insertions(+), 2642 deletions(-)
 delete mode 100644 llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 493d7cd7d8c92..7c5bf9b5c3fac 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -121,7 +121,7 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs {
   }
 };
 
-static bool moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
+static std::optional<llvm::SmallVector<MachineInstr *>> moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
                                                SUnit *Src,
                                                ScheduleDAGMCP &DG) {
   MachineInstr *DstInstr = Dst->getInstr();
@@ -129,7 +129,7 @@ static bool moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
   MachineBasicBlock *MBB = SrcInstr->getParent();
 
   if (DstInstr == nullptr || SrcInstr == nullptr)
-    return false;
+    return {};
 
   assert("This function only operates on a basic block level." &&
          MBB == SrcInstr->getParent());
@@ -199,19 +199,20 @@ static bool moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
     const auto *Current = Edges.front();
     Edges.pop();
     if (!ProcessSNodeChildren(Edges, Current, false))
-      return false;
+      return {};
   }
 
   // If all of the dependencies were deemed valid during the BFS then we
   // are moving them before the copy source here keeping their relative
   // order to each other.
+  llvm::SmallVector<MachineInstr *> InstructionsToMove;
   auto CurrentInst = SrcInstr->getIterator();
   for (int I = 0; I < SectionSize; I++) {
     if (SectionInstr[I])
-      MBB->splice(SrcInstr->getIterator(), MBB, CurrentInst->getIterator());
+      InstructionsToMove.push_back(&(*CurrentInst));
     ++CurrentInst;
   }
 
-  return true;
+  return InstructionsToMove;
 }
 
 static std::optional<DestSourcePair> isCopyInstr(const MachineInstr &MI,
@@ -1161,6 +1162,7 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
   if (!Tracker.hasAnyCopies() && !Tracker.hasAnyInvalidCopies())
     return;
 
+  std::optional<llvm::SmallVector<MachineInstr *>> InstructionsToMove = {};
   for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx != OpEnd;
        ++OpIdx) {
     MachineOperand &MODef = MI.getOperand(OpIdx);
@@ -1202,7 +1204,8 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
       SUnit *DstSUnit = DG.getSUnit(Copy);
       SUnit *SrcSUnit = DG.getSUnit(&MI);
 
-      if (!moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit, DG))
+      InstructionsToMove = moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit, DG);
+      if (!InstructionsToMove)
         continue;
     }
@@ -1234,6 +1237,10 @@
       MaybeDeadCopies.insert(Copy);
       Changed = true;
       ++NumCopyBackwardPropagated;
+    } else if (InstructionsToMove) {
+      for (auto *I : *InstructionsToMove) {
+        MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator());
+      }
     }
   }
 }
diff --git a/llvm/test/CodeGen/RISCV/llvm.frexp.ll b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
index 23c885a1d2cb6..eed4e030f6720 100644
--- a/llvm/test/CodeGen/RISCV/llvm.frexp.ll
+++ b/llvm/test/CodeGen/RISCV/llvm.frexp.ll
@@ -641,8 +641,8 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV32IZFINXZDINX-NEXT: mv s0, a4
 ; RV32IZFINXZDINX-NEXT: mv s1, a3
 ; RV32IZFINXZDINX-NEXT: mv s2, a2
-; RV32IZFINXZDINX-NEXT: mv s3, a0
 ; RV32IZFINXZDINX-NEXT: mv a2, a1
+; RV32IZFINXZDINX-NEXT: mv s3, a0
 ; RV32IZFINXZDINX-NEXT: addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT: mv a0, a2
 ; RV32IZFINXZDINX-NEXT: call frexpf
@@ -691,8 +691,8 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi
 ; RV64IZFINXZDINX-NEXT: mv s0, a4
 ; RV64IZFINXZDINX-NEXT: mv s1, a3
 ; RV64IZFINXZDINX-NEXT: mv s2, a2
-; RV64IZFINXZDINX-NEXT: mv s3, a0
 ; RV64IZFINXZDINX-NEXT: mv a2, a1
+; RV64IZFINXZDINX-NEXT: mv s3, a0
 ; RV64IZFINXZDINX-NEXT: mv a1, sp
 ; RV64IZFINXZDINX-NEXT: mv a0, a2
 ; RV64IZFINXZDINX-NEXT: call frexpf
@@ -923,8 +923,8 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV32IZFINXZDINX-NEXT: mv s0, a4
 ; RV32IZFINXZDINX-NEXT: mv s1, a3
 ; RV32IZFINXZDINX-NEXT: mv s2, a2
-; RV32IZFINXZDINX-NEXT: mv s3, a0
 ; RV32IZFINXZDINX-NEXT: mv a2, a1
+; RV32IZFINXZDINX-NEXT: mv s3, a0
 ; RV32IZFINXZDINX-NEXT: addi a1, sp, 8
 ; RV32IZFINXZDINX-NEXT: mv a0, a2
 ; RV32IZFINXZDINX-NEXT: call frexpf
@@ -965,8 +965,8 @@ define <4 x float> @test_frexp_v4f32_v4i32_only_use_fract(<4 x float> %a) nounwi
 ; RV64IZFINXZDINX-NEXT: mv s0, a4
 ; RV64IZFINXZDINX-NEXT: mv s1, a3
 ; RV64IZFINXZDINX-NEXT: mv s2, a2
-; RV64IZFINXZDINX-NEXT: mv s3, a0
 ; RV64IZFINXZDINX-NEXT: mv a2, a1
+; RV64IZFINXZDINX-NEXT: mv s3, a0
 ; RV64IZFINXZDINX-NEXT: mv a1, sp
 ; RV64IZFINXZDINX-NEXT: mv a0, a2
 ; RV64IZFINXZDINX-NEXT: call frexpf
@@ -1171,8 +1171,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind {
 ; RV32IZFINXZDINX-NEXT: mv s0, a4
 ; RV32IZFINXZDINX-NEXT: mv s1, a3
 ; RV32IZFINXZDINX-NEXT: mv s2, a2
-; RV32IZFINXZDINX-NEXT: mv s3, a0
 ; RV32IZFINXZDINX-NEXT: mv a2, a1
+; RV32IZFINXZDINX-NEXT: mv
s3, a0 ; RV32IZFINXZDINX-NEXT: addi a1, sp, 12 ; RV32IZFINXZDINX-NEXT: mv a0, a2 ; RV32IZFINXZDINX-NEXT: call frexpf @@ -1212,8 +1212,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; RV64IZFINXZDINX-NEXT: mv s0, a4 ; RV64IZFINXZDINX-NEXT: mv s1, a3 ; RV64IZFINXZDINX-NEXT: mv s2, a2 -; RV64IZFINXZDINX-NEXT: mv s3, a0 ; RV64IZFINXZDINX-NEXT: mv a2, a1 +; RV64IZFINXZDINX-NEXT: mv s3, a0 ; RV64IZFINXZDINX-NEXT: addi a1, sp, 8 ; RV64IZFINXZDINX-NEXT: mv a0, a2 ; RV64IZFINXZDINX-NEXT: call frexpf diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index 67143336de477..5efcb623aa209 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -687,12 +687,11 @@ define i1 @uaddo_i64_decrement_alt(i64 %x, ptr %p) { ; RV32: # %bb.0: ; RV32-NEXT: or a3, a0, a1 ; RV32-NEXT: seqz a4, a0 -; RV32-NEXT: sub a1, a1, a4 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: snez a3, a3 ; RV32-NEXT: sw a0, 0(a2) +; RV32-NEXT: snez a0, a3 +; RV32-NEXT: sub a1, a1, a4 ; RV32-NEXT: sw a1, 4(a2) -; RV32-NEXT: mv a0, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: uaddo_i64_decrement_alt: @@ -715,12 +714,11 @@ define i1 @uaddo_i64_decrement_alt_dom(i64 %x, ptr %p) { ; RV32: # %bb.0: ; RV32-NEXT: or a3, a0, a1 ; RV32-NEXT: seqz a4, a0 -; RV32-NEXT: sub a1, a1, a4 ; RV32-NEXT: addi a0, a0, -1 -; RV32-NEXT: snez a3, a3 ; RV32-NEXT: sw a0, 0(a2) +; RV32-NEXT: snez a0, a3 +; RV32-NEXT: sub a1, a1, a4 ; RV32-NEXT: sw a1, 4(a2) -; RV32-NEXT: mv a0, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: uaddo_i64_decrement_alt_dom: diff --git a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll index 5fe5a0eda46b8..c74cd48d75a65 100644 --- a/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/rv32zbb-zbkb.ll @@ -156,9 +156,8 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; CHECK-NEXT: sll a0, a0, a2 ; CHECK-NEXT: srli a4, a4, 1 ; CHECK-NEXT: srl a1, a4, a6 -; CHECK-NEXT: or a3, a5, a3 ; CHECK-NEXT: or a1, a0, a1 -; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: or a0, a5, a3 ; CHECK-NEXT: ret %or = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b) ret i64 %or diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll deleted file mode 100644 index 15aa11670e126..0000000000000 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/xaluo.ll +++ /dev/null @@ -1,2603 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv64 -mattr=+m -verify-machineinstrs \ -; RUN: -riscv-experimental-rv64-legal-i32 | FileCheck %s -check-prefix=RV64 -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba -verify-machineinstrs \ -; RUN: -riscv-experimental-rv64-legal-i32 | FileCheck %s --check-prefix=RV64ZBA -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zicond -verify-machineinstrs \ -; RUN: -riscv-experimental-rv64-legal-i32 | FileCheck %s --check-prefix=RV64ZICOND - -; -; Get the actual value of the overflow bit. 
-; -define zeroext i1 @saddo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) { -; RV64-LABEL: saddo1.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addw a3, a0, a1 -; RV64-NEXT: add a1, a0, a1 -; RV64-NEXT: xor a3, a1, a3 -; RV64-NEXT: snez a0, a3 -; RV64-NEXT: sw a1, 0(a2) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: saddo1.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addw a3, a0, a1 -; RV64ZBA-NEXT: add a1, a0, a1 -; RV64ZBA-NEXT: xor a3, a1, a3 -; RV64ZBA-NEXT: snez a0, a3 -; RV64ZBA-NEXT: sw a1, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: saddo1.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addw a3, a0, a1 -; RV64ZICOND-NEXT: add a1, a0, a1 -; RV64ZICOND-NEXT: xor a3, a1, a3 -; RV64ZICOND-NEXT: snez a0, a3 -; RV64ZICOND-NEXT: sw a1, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -; Test the immediate version. -define zeroext i1 @saddo2.i32(i32 signext %v1, ptr %res) { -; RV64-LABEL: saddo2.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addiw a2, a0, 4 -; RV64-NEXT: slt a0, a2, a0 -; RV64-NEXT: sw a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: saddo2.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addiw a2, a0, 4 -; RV64ZBA-NEXT: slt a0, a2, a0 -; RV64ZBA-NEXT: sw a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: saddo2.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addiw a2, a0, 4 -; RV64ZICOND-NEXT: slt a0, a2, a0 -; RV64ZICOND-NEXT: sw a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 4) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -; Test negative immediates. -define zeroext i1 @saddo3.i32(i32 signext %v1, ptr %res) { -; RV64-LABEL: saddo3.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addiw a2, a0, -4 -; RV64-NEXT: slt a0, a2, a0 -; RV64-NEXT: xori a0, a0, 1 -; RV64-NEXT: sw a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: saddo3.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addiw a2, a0, -4 -; RV64ZBA-NEXT: slt a0, a2, a0 -; RV64ZBA-NEXT: xori a0, a0, 1 -; RV64ZBA-NEXT: sw a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: saddo3.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addiw a2, a0, -4 -; RV64ZICOND-NEXT: slt a0, a2, a0 -; RV64ZICOND-NEXT: xori a0, a0, 1 -; RV64ZICOND-NEXT: sw a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 -4) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -; Test immediates that are too large to be encoded. 
-define zeroext i1 @saddo4.i32(i32 signext %v1, ptr %res) { -; RV64-LABEL: saddo4.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: lui a2, 4096 -; RV64-NEXT: addi a2, a2, -1 -; RV64-NEXT: addw a2, a0, a2 -; RV64-NEXT: slt a0, a2, a0 -; RV64-NEXT: sw a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: saddo4.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: lui a2, 4096 -; RV64ZBA-NEXT: addi a2, a2, -1 -; RV64ZBA-NEXT: addw a2, a0, a2 -; RV64ZBA-NEXT: slt a0, a2, a0 -; RV64ZBA-NEXT: sw a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: saddo4.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: lui a2, 4096 -; RV64ZICOND-NEXT: addi a2, a2, -1 -; RV64ZICOND-NEXT: addw a2, a0, a2 -; RV64ZICOND-NEXT: slt a0, a2, a0 -; RV64ZICOND-NEXT: sw a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 16777215) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, ptr %res) { -; RV64-LABEL: saddo1.i64: -; RV64: # %bb.0: # %entry -; RV64-NEXT: add a3, a0, a1 -; RV64-NEXT: slt a0, a3, a0 -; RV64-NEXT: slti a1, a1, 0 -; RV64-NEXT: xor a0, a1, a0 -; RV64-NEXT: sd a3, 0(a2) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: saddo1.i64: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: add a3, a0, a1 -; RV64ZBA-NEXT: slt a0, a3, a0 -; RV64ZBA-NEXT: slti a1, a1, 0 -; RV64ZBA-NEXT: xor a0, a1, a0 -; RV64ZBA-NEXT: sd a3, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: saddo1.i64: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: add a3, a0, a1 -; RV64ZICOND-NEXT: slt a0, a3, a0 -; RV64ZICOND-NEXT: slti a1, a1, 0 -; RV64ZICOND-NEXT: xor a0, a1, a0 -; RV64ZICOND-NEXT: sd a3, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) - %val = extractvalue {i64, i1} %t, 0 - %obit = extractvalue {i64, i1} %t, 1 - store i64 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @saddo2.i64(i64 %v1, ptr %res) { -; RV64-LABEL: saddo2.i64: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addi a2, a0, 4 -; RV64-NEXT: slt a0, a2, a0 -; RV64-NEXT: sd a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: saddo2.i64: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addi a2, a0, 4 -; RV64ZBA-NEXT: slt a0, a2, a0 -; RV64ZBA-NEXT: sd a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: saddo2.i64: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addi a2, a0, 4 -; RV64ZICOND-NEXT: slt a0, a2, a0 -; RV64ZICOND-NEXT: sd a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 4) - %val = extractvalue {i64, i1} %t, 0 - %obit = extractvalue {i64, i1} %t, 1 - store i64 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @saddo3.i64(i64 %v1, ptr %res) { -; RV64-LABEL: saddo3.i64: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addi a2, a0, -4 -; RV64-NEXT: slt a0, a2, a0 -; RV64-NEXT: xori a0, a0, 1 -; RV64-NEXT: sd a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: saddo3.i64: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addi a2, a0, -4 -; RV64ZBA-NEXT: slt a0, a2, a0 -; RV64ZBA-NEXT: xori a0, a0, 1 -; RV64ZBA-NEXT: sd a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: saddo3.i64: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addi a2, a0, -4 -; RV64ZICOND-NEXT: slt a0, a2, a0 -; RV64ZICOND-NEXT: xori a0, a0, 1 -; RV64ZICOND-NEXT: sd a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -4) - %val = extractvalue {i64, i1} %t, 0 - 
%obit = extractvalue {i64, i1} %t, 1 - store i64 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @uaddo.i32(i32 signext %v1, i32 signext %v2, ptr %res) { -; RV64-LABEL: uaddo.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addw a1, a0, a1 -; RV64-NEXT: sltu a0, a1, a0 -; RV64-NEXT: sw a1, 0(a2) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: uaddo.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addw a1, a0, a1 -; RV64ZBA-NEXT: sltu a0, a1, a0 -; RV64ZBA-NEXT: sw a1, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: uaddo.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addw a1, a0, a1 -; RV64ZICOND-NEXT: sltu a0, a1, a0 -; RV64ZICOND-NEXT: sw a1, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @uaddo.i32.constant(i32 signext %v1, ptr %res) { -; RV64-LABEL: uaddo.i32.constant: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addiw a2, a0, -2 -; RV64-NEXT: sltu a0, a2, a0 -; RV64-NEXT: sw a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: uaddo.i32.constant: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addiw a2, a0, -2 -; RV64ZBA-NEXT: sltu a0, a2, a0 -; RV64ZBA-NEXT: sw a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: uaddo.i32.constant: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addiw a2, a0, -2 -; RV64ZICOND-NEXT: sltu a0, a2, a0 -; RV64ZICOND-NEXT: sw a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 -2) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @uaddo.i32.constant_one(i32 signext %v1, ptr %res) { -; RV64-LABEL: uaddo.i32.constant_one: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addiw a2, a0, 1 -; RV64-NEXT: seqz a0, a2 -; RV64-NEXT: sw a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: uaddo.i32.constant_one: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addiw a2, a0, 1 -; RV64ZBA-NEXT: seqz a0, a2 -; RV64ZBA-NEXT: sw a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: uaddo.i32.constant_one: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addiw a2, a0, 1 -; RV64ZICOND-NEXT: seqz a0, a2 -; RV64ZICOND-NEXT: sw a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, ptr %res) { -; RV64-LABEL: uaddo.i64: -; RV64: # %bb.0: # %entry -; RV64-NEXT: add a1, a0, a1 -; RV64-NEXT: sltu a0, a1, a0 -; RV64-NEXT: sd a1, 0(a2) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: uaddo.i64: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: add a1, a0, a1 -; RV64ZBA-NEXT: sltu a0, a1, a0 -; RV64ZBA-NEXT: sd a1, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: uaddo.i64: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: add a1, a0, a1 -; RV64ZICOND-NEXT: sltu a0, a1, a0 -; RV64ZICOND-NEXT: sd a1, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) - %val = extractvalue {i64, i1} %t, 0 - %obit = extractvalue {i64, i1} %t, 1 - store i64 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @uaddo.i64.constant_one(i64 %v1, ptr %res) { -; RV64-LABEL: uaddo.i64.constant_one: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addi a2, a0, 1 -; RV64-NEXT: seqz a0, a2 -; RV64-NEXT: sd 
a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: uaddo.i64.constant_one: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addi a2, a0, 1 -; RV64ZBA-NEXT: seqz a0, a2 -; RV64ZBA-NEXT: sd a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: uaddo.i64.constant_one: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addi a2, a0, 1 -; RV64ZICOND-NEXT: seqz a0, a2 -; RV64ZICOND-NEXT: sd a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1) - %val = extractvalue {i64, i1} %t, 0 - %obit = extractvalue {i64, i1} %t, 1 - store i64 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @ssubo1.i32(i32 signext %v1, i32 signext %v2, ptr %res) { -; RV64-LABEL: ssubo1.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: subw a3, a0, a1 -; RV64-NEXT: sub a1, a0, a1 -; RV64-NEXT: xor a3, a1, a3 -; RV64-NEXT: snez a0, a3 -; RV64-NEXT: sw a1, 0(a2) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: ssubo1.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: subw a3, a0, a1 -; RV64ZBA-NEXT: sub a1, a0, a1 -; RV64ZBA-NEXT: xor a3, a1, a3 -; RV64ZBA-NEXT: snez a0, a3 -; RV64ZBA-NEXT: sw a1, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: ssubo1.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: subw a3, a0, a1 -; RV64ZICOND-NEXT: sub a1, a0, a1 -; RV64ZICOND-NEXT: xor a3, a1, a3 -; RV64ZICOND-NEXT: snez a0, a3 -; RV64ZICOND-NEXT: sw a1, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @ssubo2.i32(i32 signext %v1, ptr %res) { -; RV64-LABEL: ssubo2.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addiw a2, a0, 4 -; RV64-NEXT: slt a0, a2, a0 -; RV64-NEXT: sw a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: ssubo2.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addiw a2, a0, 4 -; RV64ZBA-NEXT: slt a0, a2, a0 -; RV64ZBA-NEXT: sw a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: ssubo2.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addiw a2, a0, 4 -; RV64ZICOND-NEXT: slt a0, a2, a0 -; RV64ZICOND-NEXT: sw a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 -4) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, ptr %res) { -; RV64-LABEL: ssubo.i64: -; RV64: # %bb.0: # %entry -; RV64-NEXT: sgtz a3, a1 -; RV64-NEXT: sub a1, a0, a1 -; RV64-NEXT: slt a0, a1, a0 -; RV64-NEXT: xor a0, a3, a0 -; RV64-NEXT: sd a1, 0(a2) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: ssubo.i64: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sgtz a3, a1 -; RV64ZBA-NEXT: sub a1, a0, a1 -; RV64ZBA-NEXT: slt a0, a1, a0 -; RV64ZBA-NEXT: xor a0, a3, a0 -; RV64ZBA-NEXT: sd a1, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: ssubo.i64: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: sgtz a3, a1 -; RV64ZICOND-NEXT: sub a1, a0, a1 -; RV64ZICOND-NEXT: slt a0, a1, a0 -; RV64ZICOND-NEXT: xor a0, a3, a0 -; RV64ZICOND-NEXT: sd a1, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) - %val = extractvalue {i64, i1} %t, 0 - %obit = extractvalue {i64, i1} %t, 1 - store i64 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @usubo.i32(i32 signext %v1, i32 signext %v2, ptr %res) { -; RV64-LABEL: usubo.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: subw a1, a0, a1 -; RV64-NEXT: 
sltu a0, a0, a1 -; RV64-NEXT: sw a1, 0(a2) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: usubo.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: subw a1, a0, a1 -; RV64ZBA-NEXT: sltu a0, a0, a1 -; RV64ZBA-NEXT: sw a1, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: usubo.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: subw a1, a0, a1 -; RV64ZICOND-NEXT: sltu a0, a0, a1 -; RV64ZICOND-NEXT: sw a1, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @usubo.i32.constant.rhs(i32 signext %v1, ptr %res) { -; RV64-LABEL: usubo.i32.constant.rhs: -; RV64: # %bb.0: # %entry -; RV64-NEXT: addiw a2, a0, 2 -; RV64-NEXT: sltu a0, a0, a2 -; RV64-NEXT: sw a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: usubo.i32.constant.rhs: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: addiw a2, a0, 2 -; RV64ZBA-NEXT: sltu a0, a0, a2 -; RV64ZBA-NEXT: sw a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: usubo.i32.constant.rhs: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: addiw a2, a0, 2 -; RV64ZICOND-NEXT: sltu a0, a0, a2 -; RV64ZICOND-NEXT: sw a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 -2) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) { -; RV64-LABEL: usubo.i32.constant.lhs: -; RV64: # %bb.0: # %entry -; RV64-NEXT: li a2, -2 -; RV64-NEXT: subw a2, a2, a0 -; RV64-NEXT: addi a0, a2, 1 -; RV64-NEXT: seqz a0, a0 -; RV64-NEXT: sw a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: usubo.i32.constant.lhs: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: li a2, -2 -; RV64ZBA-NEXT: subw a2, a2, a0 -; RV64ZBA-NEXT: addi a0, a2, 1 -; RV64ZBA-NEXT: seqz a0, a0 -; RV64ZBA-NEXT: sw a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: usubo.i32.constant.lhs: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: li a2, -2 -; RV64ZICOND-NEXT: subw a2, a2, a0 -; RV64ZICOND-NEXT: addi a0, a2, 1 -; RV64ZICOND-NEXT: seqz a0, a0 -; RV64ZICOND-NEXT: sw a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 -2, i32 %v1) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, ptr %res) { -; RV64-LABEL: usubo.i64: -; RV64: # %bb.0: # %entry -; RV64-NEXT: sub a1, a0, a1 -; RV64-NEXT: sltu a0, a0, a1 -; RV64-NEXT: sd a1, 0(a2) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: usubo.i64: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sub a1, a0, a1 -; RV64ZBA-NEXT: sltu a0, a0, a1 -; RV64ZBA-NEXT: sd a1, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: usubo.i64: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: sub a1, a0, a1 -; RV64ZICOND-NEXT: sltu a0, a0, a1 -; RV64ZICOND-NEXT: sd a1, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) - %val = extractvalue {i64, i1} %t, 0 - %obit = extractvalue {i64, i1} %t, 1 - store i64 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @smulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) { -; RV64-LABEL: smulo.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: mulw a3, a0, a1 -; RV64-NEXT: mul a1, a0, a1 -; RV64-NEXT: xor a3, a1, a3 -; RV64-NEXT: snez a0, a3 -; RV64-NEXT: sw a1, 0(a2) 
-; RV64-NEXT: ret -; -; RV64ZBA-LABEL: smulo.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: mulw a3, a0, a1 -; RV64ZBA-NEXT: mul a1, a0, a1 -; RV64ZBA-NEXT: xor a3, a1, a3 -; RV64ZBA-NEXT: snez a0, a3 -; RV64ZBA-NEXT: sw a1, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: smulo.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: mulw a3, a0, a1 -; RV64ZICOND-NEXT: mul a1, a0, a1 -; RV64ZICOND-NEXT: xor a3, a1, a3 -; RV64ZICOND-NEXT: snez a0, a3 -; RV64ZICOND-NEXT: sw a1, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @smulo2.i32(i32 signext %v1, ptr %res) { -; RV64-LABEL: smulo2.i32: -; RV64: # %bb.0: # %entry -; RV64-NEXT: li a2, 13 -; RV64-NEXT: mulw a3, a0, a2 -; RV64-NEXT: mul a2, a0, a2 -; RV64-NEXT: xor a3, a2, a3 -; RV64-NEXT: snez a0, a3 -; RV64-NEXT: sw a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: smulo2.i32: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: sh1add a2, a0, a0 -; RV64ZBA-NEXT: sh2add a2, a2, a0 -; RV64ZBA-NEXT: sext.w a0, a2 -; RV64ZBA-NEXT: xor a0, a2, a0 -; RV64ZBA-NEXT: snez a0, a0 -; RV64ZBA-NEXT: sw a2, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: smulo2.i32: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: li a2, 13 -; RV64ZICOND-NEXT: mulw a3, a0, a2 -; RV64ZICOND-NEXT: mul a2, a0, a2 -; RV64ZICOND-NEXT: xor a3, a2, a3 -; RV64ZICOND-NEXT: snez a0, a3 -; RV64ZICOND-NEXT: sw a2, 0(a1) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 13) - %val = extractvalue {i32, i1} %t, 0 - %obit = extractvalue {i32, i1} %t, 1 - store i32 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, ptr %res) { -; RV64-LABEL: smulo.i64: -; RV64: # %bb.0: # %entry -; RV64-NEXT: mulh a3, a0, a1 -; RV64-NEXT: mul a1, a0, a1 -; RV64-NEXT: srai a0, a1, 63 -; RV64-NEXT: xor a0, a3, a0 -; RV64-NEXT: snez a0, a0 -; RV64-NEXT: sd a1, 0(a2) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: smulo.i64: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: mulh a3, a0, a1 -; RV64ZBA-NEXT: mul a1, a0, a1 -; RV64ZBA-NEXT: srai a0, a1, 63 -; RV64ZBA-NEXT: xor a0, a3, a0 -; RV64ZBA-NEXT: snez a0, a0 -; RV64ZBA-NEXT: sd a1, 0(a2) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: smulo.i64: -; RV64ZICOND: # %bb.0: # %entry -; RV64ZICOND-NEXT: mulh a3, a0, a1 -; RV64ZICOND-NEXT: mul a1, a0, a1 -; RV64ZICOND-NEXT: srai a0, a1, 63 -; RV64ZICOND-NEXT: xor a0, a3, a0 -; RV64ZICOND-NEXT: snez a0, a0 -; RV64ZICOND-NEXT: sd a1, 0(a2) -; RV64ZICOND-NEXT: ret -entry: - %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) - %val = extractvalue {i64, i1} %t, 0 - %obit = extractvalue {i64, i1} %t, 1 - store i64 %val, ptr %res - ret i1 %obit -} - -define zeroext i1 @smulo2.i64(i64 %v1, ptr %res) { -; RV64-LABEL: smulo2.i64: -; RV64: # %bb.0: # %entry -; RV64-NEXT: li a2, 13 -; RV64-NEXT: mulh a3, a0, a2 -; RV64-NEXT: mul a2, a0, a2 -; RV64-NEXT: srai a0, a2, 63 -; RV64-NEXT: xor a0, a3, a0 -; RV64-NEXT: snez a0, a0 -; RV64-NEXT: sd a2, 0(a1) -; RV64-NEXT: ret -; -; RV64ZBA-LABEL: smulo2.i64: -; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: li a2, 13 -; RV64ZBA-NEXT: mulh a2, a0, a2 -; RV64ZBA-NEXT: sh1add a3, a0, a0 -; RV64ZBA-NEXT: sh2add a3, a3, a0 -; RV64ZBA-NEXT: srai a0, a3, 63 -; RV64ZBA-NEXT: xor a0, a2, a0 -; RV64ZBA-NEXT: snez a0, a0 -; RV64ZBA-NEXT: sd a3, 0(a1) -; RV64ZBA-NEXT: ret -; -; RV64ZICOND-LABEL: smulo2.i64: 
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: li a2, 13
-; RV64ZICOND-NEXT: mulh a3, a0, a2
-; RV64ZICOND-NEXT: mul a2, a0, a2
-; RV64ZICOND-NEXT: srai a0, a2, 63
-; RV64ZICOND-NEXT: xor a0, a3, a0
-; RV64ZICOND-NEXT: snez a0, a0
-; RV64ZICOND-NEXT: sd a2, 0(a1)
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 13)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- store i64 %val, ptr %res
- ret i1 %obit
-}
-
-define zeroext i1 @umulo.i32(i32 signext %v1, i32 signext %v2, ptr %res) {
-; RV64-LABEL: umulo.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: mulhu a1, a0, a1
-; RV64-NEXT: srai a0, a1, 32
-; RV64-NEXT: snez a0, a0
-; RV64-NEXT: sw a1, 0(a2)
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: zext.w a1, a1
-; RV64ZBA-NEXT: zext.w a0, a0
-; RV64ZBA-NEXT: mul a1, a0, a1
-; RV64ZBA-NEXT: srai a0, a1, 32
-; RV64ZBA-NEXT: snez a0, a0
-; RV64ZBA-NEXT: sw a1, 0(a2)
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: slli a1, a1, 32
-; RV64ZICOND-NEXT: slli a0, a0, 32
-; RV64ZICOND-NEXT: mulhu a1, a0, a1
-; RV64ZICOND-NEXT: srai a0, a1, 32
-; RV64ZICOND-NEXT: snez a0, a0
-; RV64ZICOND-NEXT: sw a1, 0(a2)
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
- %val = extractvalue {i32, i1} %t, 0
- %obit = extractvalue {i32, i1} %t, 1
- store i32 %val, ptr %res
- ret i1 %obit
-}
-
-define zeroext i1 @umulo2.i32(i32 signext %v1, ptr %res) {
-; RV64-LABEL: umulo2.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: li a2, 13
-; RV64-NEXT: slli a2, a2, 32
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: mulhu a2, a0, a2
-; RV64-NEXT: srli a0, a2, 32
-; RV64-NEXT: snez a0, a0
-; RV64-NEXT: sw a2, 0(a1)
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo2.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: zext.w a2, a0
-; RV64ZBA-NEXT: sh1add.uw a0, a0, a2
-; RV64ZBA-NEXT: sh2add a2, a0, a2
-; RV64ZBA-NEXT: srli a0, a2, 32
-; RV64ZBA-NEXT: snez a0, a0
-; RV64ZBA-NEXT: sw a2, 0(a1)
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo2.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: li a2, 13
-; RV64ZICOND-NEXT: slli a2, a2, 32
-; RV64ZICOND-NEXT: slli a0, a0, 32
-; RV64ZICOND-NEXT: mulhu a2, a0, a2
-; RV64ZICOND-NEXT: srli a0, a2, 32
-; RV64ZICOND-NEXT: snez a0, a0
-; RV64ZICOND-NEXT: sw a2, 0(a1)
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 13)
- %val = extractvalue {i32, i1} %t, 0
- %obit = extractvalue {i32, i1} %t, 1
- store i32 %val, ptr %res
- ret i1 %obit
-}
-
-; Similar to umulo.i32, but storing the overflow and returning the result.
-define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, ptr %2) {
-; RV64-LABEL: umulo3.i32:
-; RV64: # %bb.0:
-; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srai a1, a0, 32
-; RV64-NEXT: snez a1, a1
-; RV64-NEXT: sext.w a0, a0
-; RV64-NEXT: sw a1, 0(a2)
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo3.i32:
-; RV64ZBA: # %bb.0:
-; RV64ZBA-NEXT: zext.w a1, a1
-; RV64ZBA-NEXT: zext.w a0, a0
-; RV64ZBA-NEXT: mul a3, a0, a1
-; RV64ZBA-NEXT: srai a3, a3, 32
-; RV64ZBA-NEXT: snez a3, a3
-; RV64ZBA-NEXT: mulw a0, a0, a1
-; RV64ZBA-NEXT: sw a3, 0(a2)
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo3.i32:
-; RV64ZICOND: # %bb.0:
-; RV64ZICOND-NEXT: slli a1, a1, 32
-; RV64ZICOND-NEXT: slli a0, a0, 32
-; RV64ZICOND-NEXT: mulhu a0, a0, a1
-; RV64ZICOND-NEXT: srai a1, a0, 32
-; RV64ZICOND-NEXT: snez a1, a1
-; RV64ZICOND-NEXT: sext.w a0, a0
-; RV64ZICOND-NEXT: sw a1, 0(a2)
-; RV64ZICOND-NEXT: ret
- %4 = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %0, i32 %1)
- %5 = extractvalue { i32, i1 } %4, 1
- %6 = extractvalue { i32, i1 } %4, 0
- %7 = zext i1 %5 to i32
- store i32 %7, ptr %2, align 4
- ret i32 %6
-}
-
-define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, ptr %res) {
-; RV64-LABEL: umulo.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulhu a3, a0, a1
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: sd a0, 0(a2)
-; RV64-NEXT: snez a0, a3
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulhu a3, a0, a1
-; RV64ZBA-NEXT: mul a0, a0, a1
-; RV64ZBA-NEXT: sd a0, 0(a2)
-; RV64ZBA-NEXT: snez a0, a3
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulhu a3, a0, a1
-; RV64ZICOND-NEXT: mul a0, a0, a1
-; RV64ZICOND-NEXT: sd a0, 0(a2)
-; RV64ZICOND-NEXT: snez a0, a3
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- store i64 %val, ptr %res
- ret i1 %obit
-}
-
-define zeroext i1 @umulo2.i64(i64 %v1, ptr %res) {
-; RV64-LABEL: umulo2.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: li a3, 13
-; RV64-NEXT: mulhu a2, a0, a3
-; RV64-NEXT: mul a0, a0, a3
-; RV64-NEXT: sd a0, 0(a1)
-; RV64-NEXT: snez a0, a2
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo2.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: li a2, 13
-; RV64ZBA-NEXT: mulhu a2, a0, a2
-; RV64ZBA-NEXT: sh1add a3, a0, a0
-; RV64ZBA-NEXT: sh2add a0, a3, a0
-; RV64ZBA-NEXT: sd a0, 0(a1)
-; RV64ZBA-NEXT: snez a0, a2
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo2.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: li a3, 13
-; RV64ZICOND-NEXT: mulhu a2, a0, a3
-; RV64ZICOND-NEXT: mul a0, a0, a3
-; RV64ZICOND-NEXT: sd a0, 0(a1)
-; RV64ZICOND-NEXT: snez a0, a2
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 13)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- store i64 %val, ptr %res
- ret i1 %obit
-}
-
-
-;
-; Check the use of the overflow bit in combination with a select instruction.
-;
-define i32 @saddo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: saddo.select.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addw a2, a0, a1
-; RV64-NEXT: add a3, a0, a1
-; RV64-NEXT: bne a3, a2, .LBB28_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB28_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: saddo.select.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addw a2, a0, a1
-; RV64ZBA-NEXT: add a3, a0, a1
-; RV64ZBA-NEXT: bne a3, a2, .LBB28_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB28_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: saddo.select.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addw a2, a0, a1
-; RV64ZICOND-NEXT: add a3, a0, a1
-; RV64ZICOND-NEXT: xor a2, a3, a2
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = select i1 %obit, i32 %v1, i32 %v2
- ret i32 %ret
-}
-
-define i1 @saddo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: saddo.not.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addw a2, a0, a1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: xor a0, a0, a2
-; RV64-NEXT: seqz a0, a0
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: saddo.not.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addw a2, a0, a1
-; RV64ZBA-NEXT: add a0, a0, a1
-; RV64ZBA-NEXT: xor a0, a0, a2
-; RV64ZBA-NEXT: seqz a0, a0
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: saddo.not.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addw a2, a0, a1
-; RV64ZICOND-NEXT: add a0, a0, a1
-; RV64ZICOND-NEXT: xor a0, a0, a2
-; RV64ZICOND-NEXT: seqz a0, a0
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: saddo.select.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a2, a0, a1
-; RV64-NEXT: slt a2, a2, a0
-; RV64-NEXT: slti a3, a1, 0
-; RV64-NEXT: bne a3, a2, .LBB30_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB30_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: saddo.select.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a2, a0, a1
-; RV64ZBA-NEXT: slt a2, a2, a0
-; RV64ZBA-NEXT: slti a3, a1, 0
-; RV64ZBA-NEXT: bne a3, a2, .LBB30_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB30_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: saddo.select.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a2, a0, a1
-; RV64ZICOND-NEXT: slt a2, a2, a0
-; RV64ZICOND-NEXT: slti a3, a1, 0
-; RV64ZICOND-NEXT: xor a2, a3, a2
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = select i1 %obit, i64 %v1, i64 %v2
- ret i64 %ret
-}
-
-define i1 @saddo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: saddo.not.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a2, a0, a1
-; RV64-NEXT: slt a0, a2, a0
-; RV64-NEXT: slti a1, a1, 0
-; RV64-NEXT: xor a0, a1, a0
-; RV64-NEXT: xori a0, a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: saddo.not.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a2, a0, a1
-; RV64ZBA-NEXT: slt a0, a2, a0
-; RV64ZBA-NEXT: slti a1, a1, 0
-; RV64ZBA-NEXT: xor a0, a1, a0
-; RV64ZBA-NEXT: xori a0, a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: saddo.not.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a2, a0, a1
-; RV64ZICOND-NEXT: slt a0, a2, a0
-; RV64ZICOND-NEXT: slti a1, a1, 0
-; RV64ZICOND-NEXT: xor a0, a1, a0
-; RV64ZICOND-NEXT: xori a0, a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i32 @uaddo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: uaddo.select.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addw a2, a0, a1
-; RV64-NEXT: bltu a2, a0, .LBB32_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB32_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.select.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addw a2, a0, a1
-; RV64ZBA-NEXT: bltu a2, a0, .LBB32_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB32_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.select.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addw a2, a0, a1
-; RV64ZICOND-NEXT: sltu a2, a2, a0
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = select i1 %obit, i32 %v1, i32 %v2
- ret i32 %ret
-}
-
-define i1 @uaddo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: uaddo.not.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addw a1, a0, a1
-; RV64-NEXT: sltu a0, a1, a0
-; RV64-NEXT: xori a0, a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.not.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addw a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a1, a0
-; RV64ZBA-NEXT: xori a0, a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.not.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addw a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a1, a0
-; RV64ZICOND-NEXT: xori a0, a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: uaddo.select.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a2, a0, a1
-; RV64-NEXT: bltu a2, a0, .LBB34_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB34_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.select.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a2, a0, a1
-; RV64ZBA-NEXT: bltu a2, a0, .LBB34_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB34_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.select.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a2, a0, a1
-; RV64ZICOND-NEXT: sltu a2, a2, a0
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = select i1 %obit, i64 %v1, i64 %v2
- ret i64 %ret
-}
-
-define i1 @uaddo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: uaddo.not.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a1, a0, a1
-; RV64-NEXT: sltu a0, a1, a0
-; RV64-NEXT: xori a0, a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.not.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a1, a0
-; RV64ZBA-NEXT: xori a0, a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.not.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a1, a0
-; RV64ZICOND-NEXT: xori a0, a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i32 @ssubo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: ssubo.select.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: subw a2, a0, a1
-; RV64-NEXT: sub a3, a0, a1
-; RV64-NEXT: bne a3, a2, .LBB36_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB36_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: ssubo.select.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: subw a2, a0, a1
-; RV64ZBA-NEXT: sub a3, a0, a1
-; RV64ZBA-NEXT: bne a3, a2, .LBB36_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB36_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: ssubo.select.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: subw a2, a0, a1
-; RV64ZICOND-NEXT: sub a3, a0, a1
-; RV64ZICOND-NEXT: xor a2, a3, a2
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = select i1 %obit, i32 %v1, i32 %v2
- ret i32 %ret
-}
-
-define i1 @ssubo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: ssubo.not.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: subw a2, a0, a1
-; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: xor a0, a0, a2
-; RV64-NEXT: seqz a0, a0
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: ssubo.not.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: subw a2, a0, a1
-; RV64ZBA-NEXT: sub a0, a0, a1
-; RV64ZBA-NEXT: xor a0, a0, a2
-; RV64ZBA-NEXT: seqz a0, a0
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: ssubo.not.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: subw a2, a0, a1
-; RV64ZICOND-NEXT: sub a0, a0, a1
-; RV64ZICOND-NEXT: xor a0, a0, a2
-; RV64ZICOND-NEXT: seqz a0, a0
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: ssubo.select.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: sgtz a2, a1
-; RV64-NEXT: sub a3, a0, a1
-; RV64-NEXT: slt a3, a3, a0
-; RV64-NEXT: bne a2, a3, .LBB38_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB38_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: ssubo.select.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sgtz a2, a1
-; RV64ZBA-NEXT: sub a3, a0, a1
-; RV64ZBA-NEXT: slt a3, a3, a0
-; RV64ZBA-NEXT: bne a2, a3, .LBB38_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB38_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: ssubo.select.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sgtz a2, a1
-; RV64ZICOND-NEXT: sub a3, a0, a1
-; RV64ZICOND-NEXT: slt a3, a3, a0
-; RV64ZICOND-NEXT: xor a2, a2, a3
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = select i1 %obit, i64 %v1, i64 %v2
- ret i64 %ret
-}
-
-define i1 @ssub.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: ssub.not.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: sgtz a2, a1
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: slt a0, a1, a0
-; RV64-NEXT: xor a0, a2, a0
-; RV64-NEXT: xori a0, a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: ssub.not.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sgtz a2, a1
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: slt a0, a1, a0
-; RV64ZBA-NEXT: xor a0, a2, a0
-; RV64ZBA-NEXT: xori a0, a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: ssub.not.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sgtz a2, a1
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: slt a0, a1, a0
-; RV64ZICOND-NEXT: xor a0, a2, a0
-; RV64ZICOND-NEXT: xori a0, a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i32 @usubo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: usubo.select.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: subw a2, a0, a1
-; RV64-NEXT: bltu a0, a2, .LBB40_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB40_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: usubo.select.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: subw a2, a0, a1
-; RV64ZBA-NEXT: bltu a0, a2, .LBB40_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB40_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: usubo.select.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: subw a2, a0, a1
-; RV64ZICOND-NEXT: sltu a2, a0, a2
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = select i1 %obit, i32 %v1, i32 %v2
- ret i32 %ret
-}
-
-define i1 @usubo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: usubo.not.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: subw a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: xori a0, a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: usubo.not.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: subw a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a0, a1
-; RV64ZBA-NEXT: xori a0, a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: usubo.not.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: subw a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a0, a1
-; RV64ZICOND-NEXT: xori a0, a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: usubo.select.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a2, a0, a1
-; RV64-NEXT: bltu a0, a2, .LBB42_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB42_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: usubo.select.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a2, a0, a1
-; RV64ZBA-NEXT: bltu a0, a2, .LBB42_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB42_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: usubo.select.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a2, a0, a1
-; RV64ZICOND-NEXT: sltu a2, a0, a2
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = select i1 %obit, i64 %v1, i64 %v2
- ret i64 %ret
-}
-
-define i1 @usubo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: usubo.not.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: sltu a0, a0, a1
-; RV64-NEXT: xori a0, a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: usubo.not.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: sltu a0, a0, a1
-; RV64ZBA-NEXT: xori a0, a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: usubo.not.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: sltu a0, a0, a1
-; RV64ZICOND-NEXT: xori a0, a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i32 @smulo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: smulo.select.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulw a2, a0, a1
-; RV64-NEXT: mul a3, a0, a1
-; RV64-NEXT: bne a3, a2, .LBB44_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB44_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: smulo.select.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulw a2, a0, a1
-; RV64ZBA-NEXT: mul a3, a0, a1
-; RV64ZBA-NEXT: bne a3, a2, .LBB44_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB44_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: smulo.select.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulw a2, a0, a1
-; RV64ZICOND-NEXT: mul a3, a0, a1
-; RV64ZICOND-NEXT: xor a2, a3, a2
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = select i1 %obit, i32 %v1, i32 %v2
- ret i32 %ret
-}
-
-define i1 @smulo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: smulo.not.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulw a2, a0, a1
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: xor a0, a0, a2
-; RV64-NEXT: seqz a0, a0
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: smulo.not.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulw a2, a0, a1
-; RV64ZBA-NEXT: mul a0, a0, a1
-; RV64ZBA-NEXT: xor a0, a0, a2
-; RV64ZBA-NEXT: seqz a0, a0
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: smulo.not.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulw a2, a0, a1
-; RV64ZICOND-NEXT: mul a0, a0, a1
-; RV64ZICOND-NEXT: xor a0, a0, a2
-; RV64ZICOND-NEXT: seqz a0, a0
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: smulo.select.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulh a2, a0, a1
-; RV64-NEXT: mul a3, a0, a1
-; RV64-NEXT: srai a3, a3, 63
-; RV64-NEXT: bne a2, a3, .LBB46_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB46_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: smulo.select.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulh a2, a0, a1
-; RV64ZBA-NEXT: mul a3, a0, a1
-; RV64ZBA-NEXT: srai a3, a3, 63
-; RV64ZBA-NEXT: bne a2, a3, .LBB46_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB46_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: smulo.select.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulh a2, a0, a1
-; RV64ZICOND-NEXT: mul a3, a0, a1
-; RV64ZICOND-NEXT: srai a3, a3, 63
-; RV64ZICOND-NEXT: xor a2, a2, a3
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = select i1 %obit, i64 %v1, i64 %v2
- ret i64 %ret
-}
-
-define i1 @smulo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: smulo.not.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulh a2, a0, a1
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srai a0, a0, 63
-; RV64-NEXT: xor a0, a2, a0
-; RV64-NEXT: seqz a0, a0
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: smulo.not.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulh a2, a0, a1
-; RV64ZBA-NEXT: mul a0, a0, a1
-; RV64ZBA-NEXT: srai a0, a0, 63
-; RV64ZBA-NEXT: xor a0, a2, a0
-; RV64ZBA-NEXT: seqz a0, a0
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: smulo.not.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulh a2, a0, a1
-; RV64ZICOND-NEXT: mul a0, a0, a1
-; RV64ZICOND-NEXT: srai a0, a0, 63
-; RV64ZICOND-NEXT: xor a0, a2, a0
-; RV64ZICOND-NEXT: seqz a0, a0
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i32 @umulo.select.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: umulo.select.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: slli a2, a1, 32
-; RV64-NEXT: slli a3, a0, 32
-; RV64-NEXT: mulhu a2, a3, a2
-; RV64-NEXT: srai a2, a2, 32
-; RV64-NEXT: bnez a2, .LBB48_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB48_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo.select.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: zext.w a2, a1
-; RV64ZBA-NEXT: zext.w a3, a0
-; RV64ZBA-NEXT: mul a2, a3, a2
-; RV64ZBA-NEXT: srai a2, a2, 32
-; RV64ZBA-NEXT: bnez a2, .LBB48_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB48_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo.select.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: slli a2, a1, 32
-; RV64ZICOND-NEXT: slli a3, a0, 32
-; RV64ZICOND-NEXT: mulhu a2, a3, a2
-; RV64ZICOND-NEXT: srai a2, a2, 32
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = select i1 %obit, i32 %v1, i32 %v2
- ret i32 %ret
-}
-
-define i1 @umulo.not.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: umulo.not.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srai a0, a0, 32
-; RV64-NEXT: seqz a0, a0
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo.not.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: zext.w a1, a1
-; RV64ZBA-NEXT: zext.w a0, a0
-; RV64ZBA-NEXT: mul a0, a0, a1
-; RV64ZBA-NEXT: srai a0, a0, 32
-; RV64ZBA-NEXT: seqz a0, a0
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo.not.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: slli a1, a1, 32
-; RV64ZICOND-NEXT: slli a0, a0, 32
-; RV64ZICOND-NEXT: mulhu a0, a0, a1
-; RV64ZICOND-NEXT: srai a0, a0, 32
-; RV64ZICOND-NEXT: seqz a0, a0
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
- %obit = extractvalue {i32, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: umulo.select.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulhu a2, a0, a1
-; RV64-NEXT: bnez a2, .LBB50_2
-; RV64-NEXT: # %bb.1: # %entry
-; RV64-NEXT: mv a0, a1
-; RV64-NEXT: .LBB50_2: # %entry
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo.select.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulhu a2, a0, a1
-; RV64ZBA-NEXT: bnez a2, .LBB50_2
-; RV64ZBA-NEXT: # %bb.1: # %entry
-; RV64ZBA-NEXT: mv a0, a1
-; RV64ZBA-NEXT: .LBB50_2: # %entry
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo.select.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulhu a2, a0, a1
-; RV64ZICOND-NEXT: czero.nez a1, a1, a2
-; RV64ZICOND-NEXT: czero.eqz a0, a0, a2
-; RV64ZICOND-NEXT: or a0, a0, a1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = select i1 %obit, i64 %v1, i64 %v2
- ret i64 %ret
-}
-
-define i1 @umulo.not.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: umulo.not.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: seqz a0, a0
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo.not.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulhu a0, a0, a1
-; RV64ZBA-NEXT: seqz a0, a0
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo.not.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulhu a0, a0, a1
-; RV64ZICOND-NEXT: seqz a0, a0
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
- %obit = extractvalue {i64, i1} %t, 1
- %ret = xor i1 %obit, true
- ret i1 %ret
-}
-
-
-;
-; Check the use of the overflow bit in combination with a branch instruction.
-;
-define zeroext i1 @saddo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: saddo.br.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addw a2, a0, a1
-; RV64-NEXT: add a0, a0, a1
-; RV64-NEXT: beq a0, a2, .LBB52_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB52_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: saddo.br.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addw a2, a0, a1
-; RV64ZBA-NEXT: add a0, a0, a1
-; RV64ZBA-NEXT: beq a0, a2, .LBB52_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB52_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: saddo.br.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addw a2, a0, a1
-; RV64ZICOND-NEXT: add a0, a0, a1
-; RV64ZICOND-NEXT: beq a0, a2, .LBB52_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB52_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
- %val = extractvalue {i32, i1} %t, 0
- %obit = extractvalue {i32, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: saddo.br.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a2, a0, a1
-; RV64-NEXT: slt a0, a2, a0
-; RV64-NEXT: slti a1, a1, 0
-; RV64-NEXT: beq a1, a0, .LBB53_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB53_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: saddo.br.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a2, a0, a1
-; RV64ZBA-NEXT: slt a0, a2, a0
-; RV64ZBA-NEXT: slti a1, a1, 0
-; RV64ZBA-NEXT: beq a1, a0, .LBB53_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB53_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: saddo.br.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a2, a0, a1
-; RV64ZICOND-NEXT: slt a0, a2, a0
-; RV64ZICOND-NEXT: slti a1, a1, 0
-; RV64ZICOND-NEXT: beq a1, a0, .LBB53_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB53_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
-; RV64-LABEL: uaddo.br.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addw a1, a0, a1
-; RV64-NEXT: sext.w a0, a0
-; RV64-NEXT: bgeu a1, a0, .LBB54_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB54_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.br.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addw a1, a0, a1
-; RV64ZBA-NEXT: sext.w a0, a0
-; RV64ZBA-NEXT: bgeu a1, a0, .LBB54_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB54_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.br.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addw a1, a0, a1
-; RV64ZICOND-NEXT: sext.w a0, a0
-; RV64ZICOND-NEXT: bgeu a1, a0, .LBB54_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB54_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
- %val = extractvalue {i32, i1} %t, 0
- %obit = extractvalue {i32, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: uaddo.br.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a1, a0, a1
-; RV64-NEXT: bgeu a1, a0, .LBB55_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB55_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.br.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a1, a0, a1
-; RV64ZBA-NEXT: bgeu a1, a0, .LBB55_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB55_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.br.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a1, a0, a1
-; RV64ZICOND-NEXT: bgeu a1, a0, .LBB55_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB55_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @ssubo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: ssubo.br.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: subw a2, a0, a1
-; RV64-NEXT: sub a0, a0, a1
-; RV64-NEXT: beq a0, a2, .LBB56_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB56_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: ssubo.br.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: subw a2, a0, a1
-; RV64ZBA-NEXT: sub a0, a0, a1
-; RV64ZBA-NEXT: beq a0, a2, .LBB56_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB56_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: ssubo.br.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: subw a2, a0, a1
-; RV64ZICOND-NEXT: sub a0, a0, a1
-; RV64ZICOND-NEXT: beq a0, a2, .LBB56_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB56_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
- %val = extractvalue {i32, i1} %t, 0
- %obit = extractvalue {i32, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: ssubo.br.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: sgtz a2, a1
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: slt a0, a1, a0
-; RV64-NEXT: beq a2, a0, .LBB57_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB57_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: ssubo.br.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sgtz a2, a1
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: slt a0, a1, a0
-; RV64ZBA-NEXT: beq a2, a0, .LBB57_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB57_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: ssubo.br.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sgtz a2, a1
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: slt a0, a1, a0
-; RV64ZICOND-NEXT: beq a2, a0, .LBB57_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB57_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @usubo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: usubo.br.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: subw a1, a0, a1
-; RV64-NEXT: bgeu a0, a1, .LBB58_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB58_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: usubo.br.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: subw a1, a0, a1
-; RV64ZBA-NEXT: bgeu a0, a1, .LBB58_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB58_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: usubo.br.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: subw a1, a0, a1
-; RV64ZICOND-NEXT: bgeu a0, a1, .LBB58_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB58_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
- %val = extractvalue {i32, i1} %t, 0
- %obit = extractvalue {i32, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: usubo.br.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: sub a1, a0, a1
-; RV64-NEXT: bgeu a0, a1, .LBB59_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB59_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: usubo.br.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: sub a1, a0, a1
-; RV64ZBA-NEXT: bgeu a0, a1, .LBB59_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB59_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: usubo.br.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: sub a1, a0, a1
-; RV64ZICOND-NEXT: bgeu a0, a1, .LBB59_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB59_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @smulo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: smulo.br.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulw a2, a0, a1
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: beq a0, a2, .LBB60_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB60_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: smulo.br.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulw a2, a0, a1
-; RV64ZBA-NEXT: mul a0, a0, a1
-; RV64ZBA-NEXT: beq a0, a2, .LBB60_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB60_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: smulo.br.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulw a2, a0, a1
-; RV64ZICOND-NEXT: mul a0, a0, a1
-; RV64ZICOND-NEXT: beq a0, a2, .LBB60_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB60_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
- %val = extractvalue {i32, i1} %t, 0
- %obit = extractvalue {i32, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: smulo.br.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulh a2, a0, a1
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srai a0, a0, 63
-; RV64-NEXT: beq a2, a0, .LBB61_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB61_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: smulo.br.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulh a2, a0, a1
-; RV64ZBA-NEXT: mul a0, a0, a1
-; RV64ZBA-NEXT: srai a0, a0, 63
-; RV64ZBA-NEXT: beq a2, a0, .LBB61_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB61_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: smulo.br.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulh a2, a0, a1
-; RV64ZICOND-NEXT: mul a0, a0, a1
-; RV64ZICOND-NEXT: srai a0, a0, 63
-; RV64ZICOND-NEXT: beq a2, a0, .LBB61_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB61_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @smulo2.br.i64(i64 %v1) {
-; RV64-LABEL: smulo2.br.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: li a1, -13
-; RV64-NEXT: mulh a2, a0, a1
-; RV64-NEXT: mul a0, a0, a1
-; RV64-NEXT: srai a0, a0, 63
-; RV64-NEXT: beq a2, a0, .LBB62_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB62_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: smulo2.br.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: li a1, -13
-; RV64ZBA-NEXT: mulh a2, a0, a1
-; RV64ZBA-NEXT: mul a0, a0, a1
-; RV64ZBA-NEXT: srai a0, a0, 63
-; RV64ZBA-NEXT: beq a2, a0, .LBB62_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB62_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: smulo2.br.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: li a1, -13
-; RV64ZICOND-NEXT: mulh a2, a0, a1
-; RV64ZICOND-NEXT: mul a0, a0, a1
-; RV64ZICOND-NEXT: srai a0, a0, 63
-; RV64ZICOND-NEXT: beq a2, a0, .LBB62_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB62_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 -13)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @umulo.br.i32(i32 signext %v1, i32 signext %v2) {
-; RV64-LABEL: umulo.br.i32:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: slli a1, a1, 32
-; RV64-NEXT: slli a0, a0, 32
-; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: srai a0, a0, 32
-; RV64-NEXT: beqz a0, .LBB63_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB63_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo.br.i32:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: zext.w a1, a1
-; RV64ZBA-NEXT: zext.w a0, a0
-; RV64ZBA-NEXT: mul a0, a0, a1
-; RV64ZBA-NEXT: srai a0, a0, 32
-; RV64ZBA-NEXT: beqz a0, .LBB63_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB63_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo.br.i32:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: slli a1, a1, 32
-; RV64ZICOND-NEXT: slli a0, a0, 32
-; RV64ZICOND-NEXT: mulhu a0, a0, a1
-; RV64ZICOND-NEXT: srai a0, a0, 32
-; RV64ZICOND-NEXT: beqz a0, .LBB63_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB63_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
- %val = extractvalue {i32, i1} %t, 0
- %obit = extractvalue {i32, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
-; RV64-LABEL: umulo.br.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: mulhu a0, a0, a1
-; RV64-NEXT: beqz a0, .LBB64_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB64_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo.br.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: mulhu a0, a0, a1
-; RV64ZBA-NEXT: beqz a0, .LBB64_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB64_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo.br.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: mulhu a0, a0, a1
-; RV64ZICOND-NEXT: beqz a0, .LBB64_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB64_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @umulo2.br.i64(i64 %v1) {
-; RV64-LABEL: umulo2.br.i64:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: add a1, a0, a0
-; RV64-NEXT: bgeu a1, a0, .LBB65_2
-; RV64-NEXT: # %bb.1: # %overflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: ret
-; RV64-NEXT: .LBB65_2: # %continue
-; RV64-NEXT: li a0, 1
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: umulo2.br.i64:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: add a1, a0, a0
-; RV64ZBA-NEXT: bgeu a1, a0, .LBB65_2
-; RV64ZBA-NEXT: # %bb.1: # %overflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: ret
-; RV64ZBA-NEXT: .LBB65_2: # %continue
-; RV64ZBA-NEXT: li a0, 1
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: umulo2.br.i64:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: add a1, a0, a0
-; RV64ZICOND-NEXT: bgeu a1, a0, .LBB65_2
-; RV64ZICOND-NEXT: # %bb.1: # %overflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: ret
-; RV64ZICOND-NEXT: .LBB65_2: # %continue
-; RV64ZICOND-NEXT: li a0, 1
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- br i1 %obit, label %overflow, label %continue
-
-overflow:
- ret i1 false
-
-continue:
- ret i1 true
-}
-
-define zeroext i1 @uaddo.i64.constant(i64 %v1, ptr %res) {
-; RV64-LABEL: uaddo.i64.constant:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addi a2, a0, 2
-; RV64-NEXT: sltu a0, a2, a0
-; RV64-NEXT: sd a2, 0(a1)
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.i64.constant:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addi a2, a0, 2
-; RV64ZBA-NEXT: sltu a0, a2, a0
-; RV64ZBA-NEXT: sd a2, 0(a1)
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.i64.constant:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addi a2, a0, 2
-; RV64ZICOND-NEXT: sltu a0, a2, a0
-; RV64ZICOND-NEXT: sd a2, 0(a1)
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- store i64 %val, ptr %res
- ret i1 %obit
-}
-
-define zeroext i1 @uaddo.i64.constant_2048(i64 %v1, ptr %res) {
-; RV64-LABEL: uaddo.i64.constant_2048:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addi a2, a0, 2047
-; RV64-NEXT: addi a2, a2, 1
-; RV64-NEXT: sltu a0, a2, a0
-; RV64-NEXT: sd a2, 0(a1)
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.i64.constant_2048:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addi a2, a0, 2047
-; RV64ZBA-NEXT: addi a2, a2, 1
-; RV64ZBA-NEXT: sltu a0, a2, a0
-; RV64ZBA-NEXT: sd a2, 0(a1)
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.i64.constant_2048:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addi a2, a0, 2047
-; RV64ZICOND-NEXT: addi a2, a2, 1
-; RV64ZICOND-NEXT: sltu a0, a2, a0
-; RV64ZICOND-NEXT: sd a2, 0(a1)
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2048)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- store i64 %val, ptr %res
- ret i1 %obit
-}
-
-define zeroext i1 @uaddo.i64.constant_2049(i64 %v1, ptr %res) {
-; RV64-LABEL: uaddo.i64.constant_2049:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addi a2, a0, 2047
-; RV64-NEXT: addi a2, a2, 2
-; RV64-NEXT: sltu a0, a2, a0
-; RV64-NEXT: sd a2, 0(a1)
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.i64.constant_2049:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: addi a2, a0, 2047
-; RV64ZBA-NEXT: addi a2, a2, 2
-; RV64ZBA-NEXT: sltu a0, a2, a0
-; RV64ZBA-NEXT: sd a2, 0(a1)
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.i64.constant_2049:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: addi a2, a0, 2047
-; RV64ZICOND-NEXT: addi a2, a2, 2
-; RV64ZICOND-NEXT: sltu a0, a2, a0
-; RV64ZICOND-NEXT: sd a2, 0(a1)
-; RV64ZICOND-NEXT: ret
-entry:
- %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2049)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- store i64 %val, ptr %res
- ret i1 %obit
-}
-
-define i64 @uaddo.i64.constant_setcc_on_overflow_flag(ptr %p) {
-; RV64-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: ld a1, 0(a0)
-; RV64-NEXT: addi a0, a1, 2
-; RV64-NEXT: bltu a0, a1, .LBB69_2
-; RV64-NEXT: # %bb.1: # %IfOverflow
-; RV64-NEXT: li a0, 0
-; RV64-NEXT: .LBB69_2: # %IfNoOverflow
-; RV64-NEXT: ret
-;
-; RV64ZBA-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
-; RV64ZBA: # %bb.0: # %entry
-; RV64ZBA-NEXT: ld a1, 0(a0)
-; RV64ZBA-NEXT: addi a0, a1, 2
-; RV64ZBA-NEXT: bltu a0, a1, .LBB69_2
-; RV64ZBA-NEXT: # %bb.1: # %IfOverflow
-; RV64ZBA-NEXT: li a0, 0
-; RV64ZBA-NEXT: .LBB69_2: # %IfNoOverflow
-; RV64ZBA-NEXT: ret
-;
-; RV64ZICOND-LABEL: uaddo.i64.constant_setcc_on_overflow_flag:
-; RV64ZICOND: # %bb.0: # %entry
-; RV64ZICOND-NEXT: ld a1, 0(a0)
-; RV64ZICOND-NEXT: addi a0, a1, 2
-; RV64ZICOND-NEXT: bltu a0, a1, .LBB69_2
-; RV64ZICOND-NEXT: # %bb.1: # %IfOverflow
-; RV64ZICOND-NEXT: li a0, 0
-; RV64ZICOND-NEXT: .LBB69_2: # %IfNoOverflow
-; RV64ZICOND-NEXT: ret
-entry:
- %v1 = load i64, ptr %p
- %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 2)
- %val = extractvalue {i64, i1} %t, 0
- %obit = extractvalue {i64, i1} %t, 1
- br i1 %obit, label %IfNoOverflow, label %IfOverflow
-IfOverflow:
- ret i64 0
-IfNoOverflow:
- ret i64 %val
-}
-
-declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
-declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
index 27a5773e64043..1c14e2ca5ef87 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll
@@ -29,10 +29,9 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
 ; CHECK-NEXT: vadd.vi v12, v11, 1
 ; CHECK-NEXT: vrgather.vv v13, v10, v12
 ; CHECK-NEXT: vadd.vi v10, v11, -15
-; CHECK-NEXT: vmsne.vi v9, v9, 0
 ; CHECK-NEXT: vrgather.vv v13, v8, v10, v0.t
+; CHECK-NEXT: vmsne.vi v0, v9, 0
 ; CHECK-NEXT: vmsne.vi v8, v13, 0
-; CHECK-NEXT: vmv.v.v v0, v9
 ; CHECK-NEXT: ret
 %vec = load <32 x i1>, ptr %p
 %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
index 5e68d8cbb0755..016f95bfef7e7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-int-vp.ll
@@ -1379,9 +1379,9 @@ define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl)
 ; RV32-NEXT: .cfi_def_cfa_offset 16
 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: mv a2, a0
 ; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
 ; RV32-NEXT: vmv.s.x v9, a1
-; RV32-NEXT: mv a2, a0
 ; RV32-NEXT: vmsne.vi v9, v9, 0
 ; RV32-NEXT: vmand.mm v0, v9, v0
 ; RV32-NEXT: vmv.v.i v9, 1
@@ -1400,9 +1400,9 @@ define i8 @vpreduce_mul_v1i8(i8 %s, <1 x i8> %v, <1 x i1> %m, i32 zeroext %evl)
 ; RV64-NEXT: .cfi_def_cfa_offset 16
 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: mv a2, a0
 ; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
 ; RV64-NEXT: vmv.s.x v9, a1
-; RV64-NEXT: mv a2, a0
 ; RV64-NEXT: vmsne.vi v9, v9, 0
 ; RV64-NEXT: vmand.mm v0, v9, v0
 ; RV64-NEXT: vmv.v.i v9, 1
@@ -1427,10 +1427,10 @@ define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3
 ; RV32-NEXT: .cfi_def_cfa_offset 16
 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: mv a2, a0
 ; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; RV32-NEXT: vid.v v9
 ; RV32-NEXT: vmsltu.vx v9, v9, a1
-; RV32-NEXT: mv a2, a0
 ; RV32-NEXT: vmand.mm v0, v9, v0
 ; RV32-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
 ; RV32-NEXT: vmv.v.i v9, 1
@@ -1452,10 +1452,10 @@ define signext i8 @vpreduce_mul_v2i8(i8 signext %s, <2 x i8> %v, <2 x i1> %m, i3
 ; RV64-NEXT: .cfi_def_cfa_offset 16
 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: mv a2, a0
 ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
 ; RV64-NEXT: vid.v v9
 ; RV64-NEXT: vmsltu.vx v9, v9, a1
-; RV64-NEXT: mv a2, a0
 ; RV64-NEXT: vmand.mm v0, v9, v0
 ; RV64-NEXT: vsetvli zero, zero, e8, mf8, ta, ma
 ; RV64-NEXT: vmv.v.i v9, 1
@@ -1483,10 +1483,10 @@ define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3
 ; RV32-NEXT: .cfi_def_cfa_offset 16
 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: mv a2, a0
 ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
 ; RV32-NEXT: vid.v v9
 ; RV32-NEXT: vmsltu.vx v9, v9, a1
-; RV32-NEXT: mv a2, a0
 ; RV32-NEXT: vmand.mm v0, v9, v0
 ; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
 ; RV32-NEXT: vmv.v.i v9, 1
@@ -1510,10 +1510,10 @@ define signext i8 @vpreduce_mul_v4i8(i8 signext %s, <4 x i8> %v, <4 x i1> %m, i3
 ; RV64-NEXT: .cfi_def_cfa_offset 16
 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
 ; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: mv a2, a0
 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
 ; RV64-NEXT: vid.v v9
 ; RV64-NEXT: vmsltu.vx v9, v9, a1
-; RV64-NEXT: mv a2, a0
 ; RV64-NEXT: vmand.mm v0, v9, v0
 ; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
 ; RV64-NEXT: vmv.v.i v9, 1
@@ -1543,10 +1543,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3
 ; RV32-NEXT: .cfi_def_cfa_offset 16
 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
 ; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: mv a2, a0
 ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
 ; RV32-NEXT: vid.v v10
 ; RV32-NEXT: vmsltu.vx v9, v10, a1
-; RV32-NEXT: mv a2, a0
 ; RV32-NEXT: vmand.mm v0, v9, v0
 ; RV32-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
 ; RV32-NEXT: vmv.v.i v9, 1
@@ -1572,10 +1572,10 @@ define signext i8 @vpreduce_mul_v8i8(i8 signext %s, <8 x i8> %v, <8 x i1> %m, i3
 ; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV64-NEXT: vid.v v10 ; RV64-NEXT: vmsltu.vx v9, v10, a1 -; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vmand.mm v0, v9, v0 ; RV64-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; RV64-NEXT: vmv.v.i v9, 1 @@ -1607,10 +1607,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m, ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vid.v v12 ; RV32-NEXT: vmsltu.vx v9, v12, a1 -; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vmand.mm v0, v9, v0 ; RV32-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV32-NEXT: vmv.v.i v9, 1 @@ -1638,10 +1638,10 @@ define signext i8 @vpreduce_mul_v16i8(i8 signext %s, <16 x i8> %v, <16 x i1> %m, ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV64-NEXT: vid.v v12 ; RV64-NEXT: vmsltu.vx v9, v12, a1 -; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vmand.mm v0, v9, v0 ; RV64-NEXT: vsetvli zero, zero, e8, m1, ta, ma ; RV64-NEXT: vmv.v.i v9, 1 @@ -1754,10 +1754,10 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, ; RV32-NEXT: addi a2, a2, %lo(.LCPI72_0) ; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV32-NEXT: vle8.v v12, (a2) +; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vid.v v16 ; RV32-NEXT: vmsltu.vx v14, v16, a1 ; RV32-NEXT: vsext.vf4 v16, v12 -; RV32-NEXT: mv a2, a0 ; RV32-NEXT: vmsltu.vx v12, v16, a1 ; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV32-NEXT: vslideup.vi v14, v12, 4 @@ -1798,10 +1798,10 @@ define signext i8 @vpreduce_mul_v64i8(i8 signext %s, <64 x i8> %v, <64 x i1> %m, ; RV64-NEXT: addi a2, a2, %lo(.LCPI72_0) ; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma ; RV64-NEXT: vle8.v v12, (a2) +; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vid.v v16 ; RV64-NEXT: vmsltu.vx v14, v16, a1 ; RV64-NEXT: vsext.vf4 v16, v12 -; RV64-NEXT: mv a2, a0 ; RV64-NEXT: vmsltu.vx v12, v16, a1 ; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; RV64-NEXT: vslideup.vi v14, v12, 4 diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 6300571686013..2de2b750e6808 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -26,10 +26,9 @@ define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { ; CHECK-NEXT: vadd.vi v12, v11, 1 ; CHECK-NEXT: vrgather.vv v13, v10, v12 ; CHECK-NEXT: vadd.vi v10, v11, -15 -; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: vrgather.vv v13, v8, v10, v0.t +; CHECK-NEXT: vmsne.vi v0, v9, 0 ; CHECK-NEXT: vmsne.vi v8, v13, 0 -; CHECK-NEXT: vmv.v.v v0, v9 ; CHECK-NEXT: ret %retval = call {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1> %vec) ret {<16 x i1>, <16 x i1>} %retval diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll index 6a4ebb6b30af2..bcb008857ad32 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -97,10 +97,10 @@ define {, } @vector_deinterleave_nxv64i1_nxv ; CHECK-NEXT: slli a0, a0, 1 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 
0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: vmv1r.v v12, v8
 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
 ; CHECK-NEXT: vmv.v.i v24, 0
 ; CHECK-NEXT: vmerge.vim v16, v24, 1, v0
-; CHECK-NEXT: vmv1r.v v12, v8
 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT: vnsrl.wi v8, v16, 0
 ; CHECK-NEXT: vmv1r.v v0, v12
diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
index bbf1df20aeda5..1efd57194fba6 100644
--- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
+++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll
@@ -460,9 +460,8 @@ define ptr @sdia(ptr %base, i64 %a, i64 %b) {
 ; RV32XTHEADMEMIDX-NEXT: sltu a1, a3, a1
 ; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1
 ; RV32XTHEADMEMIDX-NEXT: sw a3, 0(a0)
-; RV32XTHEADMEMIDX-NEXT: addi a5, a0, 64
 ; RV32XTHEADMEMIDX-NEXT: sw a1, 4(a0)
-; RV32XTHEADMEMIDX-NEXT: mv a0, a5
+; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 64
 ; RV32XTHEADMEMIDX-NEXT: ret
 ;
 ; RV64XTHEADMEMIDX-LABEL: sdia:

From efc8bd617b1361f91c7d415060f4360bf27f8e68 Mon Sep 17 00:00:00 2001
From: Gabor Spaits
Date: Wed, 21 Aug 2024 22:29:01 +0200
Subject: [PATCH 03/15] Formatting

---
 llvm/lib/CodeGen/MachineCopyPropagation.cpp | 39 +++++++++++----------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index 7c5bf9b5c3fac..750eaf2de4857 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -121,9 +121,8 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs {
   }
 };
 
-static std::optional<llvm::SmallVector<MachineInstr *>> moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst,
-                                                                                           SUnit *Src,
-                                                                                           ScheduleDAGMCP &DG) {
+static std::optional<llvm::SmallVector<MachineInstr *>>
+moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) {
   MachineInstr *DstInstr = Dst->getInstr();
   MachineInstr *SrcInstr = Src->getInstr();
   MachineBasicBlock *MBB = SrcInstr->getParent();
@@ -185,15 +184,15 @@ static std::optional<llvm::SmallVector<MachineInstr *>> moveInstructionsOutOfThe
   // processing stage. In some context it does matter what the parent of the
   // instruction was: Namely when we are starting the traversal with the source
   // of the copy propagation. This instruction must have the destination as a
-  // dependency. In case of other instruction than has the destination as a dependency, this
-  // dependency would mean the end of the traversal, but in this scenario this
-  // must be ignored. Let's say that we can not control what nodes to process
-  // and we come across the copy source. How do I know what node has that copy
-  // source as their dependency? We can check of which node is the copy source
-  // the dependency of. This list will alway contain the source. To decide if we
-  // have it as dependency of another instruction, we must check in the already
-  // traversed list if any of the instructions that is depended on the source is
-  // contained. This would introduce extra costs.
+  // dependency. For any other instruction that has the destination as a
+  // dependency, this dependency would mean the end of the traversal, but in
+  // this scenario it must be ignored. Suppose we could not control which nodes
+  // get processed and we came across the copy source. How would we know which
+  // nodes have that copy source as their dependency? We could query the nodes
+  // the copy source is a dependency of; that list always contains the source.
+  // To decide whether another instruction has it as a dependency, we would
+  // have to search the already-traversed list for an instruction that depends
+  // on the source, and that would introduce extra cost.
   ProcessSNodeChildren(Edges, Dst, true);
   while (!Edges.empty()) {
     const auto *Current = Edges.front();
@@ -1156,9 +1155,9 @@ static bool isBackwardPropagatableCopy(const DestSourcePair &CopyOperands,
   return CopyOperands.Source->isRenamable() && CopyOperands.Source->isKill();
 }
 
-void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
-                                           ScheduleDAGMCP &DG,
-                                           bool MoveDependenciesForBetterCopyPropagation) {
+void MachineCopyPropagation::propagateDefs(
+    MachineInstr &MI, ScheduleDAGMCP &DG,
+    bool MoveDependenciesForBetterCopyPropagation) {
   if (!Tracker.hasAnyCopies() && !Tracker.hasAnyInvalidCopies())
     return;
 
@@ -1204,7 +1203,8 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI,
 
       SUnit *DstSUnit = DG.getSUnit(Copy);
      SUnit *SrcSUnit = DG.getSUnit(&MI);
-      InstructionsToMove = moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit, DG);
+      InstructionsToMove =
+          moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit);
       if (!InstructionsToMove)
         continue;
     }
@@ -1273,8 +1273,8 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
       // Unlike forward cp, we don't invoke propagateDefs here,
       // just let forward cp do COPY-to-COPY propagation.
       if (isBackwardPropagatableCopy(*CopyOperands, *MRI)) {
-        Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII,
-                                   UseCopyInstr, MoveDependenciesForBetterCopyPropagation);
+        Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII, UseCopyInstr,
+                                   MoveDependenciesForBetterCopyPropagation);
         Tracker.invalidateRegister(DefReg.asMCReg(), *TRI, *TII, UseCopyInstr);
         Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr);
@@ -1316,7 +1316,8 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock(
       }
     } else {
       Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII,
-                                 UseCopyInstr, MoveDependenciesForBetterCopyPropagation);
+                                 UseCopyInstr,
+                                 MoveDependenciesForBetterCopyPropagation);
     }
   }
 }
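A reviewer's note before the test updates: the long comment patched above describes a breadth-first walk over the ScheduleDAG, starting at the copy and following the instructions that depend on it, with the single edge leading to the propagation target deliberately ignored. The sketch below is a self-contained toy model of that shape, not the code of this patch: the Node type, the collectMovable helper, and the example graph are invented stand-ins for SUnit, moveInstructionsOutOfTheWayIfWeCan, and a real dependency graph. It gathers the dependants sitting between the copy and its user so they could be sunk below the user, and gives up when one of them also feeds the user, since such an instruction cannot legally move.

#include <deque>
#include <iostream>
#include <set>
#include <vector>

// Node stands in for an SUnit; Succs lists the instructions that depend on
// this one within a single basic block.
struct Node {
  int Pos;                   // position of the instruction in the block
  std::vector<Node *> Succs; // instructions that depend on this one
};

// Walk the dependants of the copy (Dst) breadth-first. The edge from Dst to
// the propagation target (Src) is the copy-propagation pair itself and is
// the one dependency the walk ignores. Every other dependant that sits
// between Dst and Src is collected so it could be sunk below Src; if such a
// dependant also feeds Src, it cannot legally move below it, so we fail.
bool collectMovable(Node *Dst, Node *Src, std::vector<Node *> &ToMove) {
  std::deque<Node *> Work(Dst->Succs.begin(), Dst->Succs.end());
  std::set<Node *> Seen;
  while (!Work.empty()) {
    Node *Cur = Work.front();
    Work.pop_front();
    if (Cur == Src || !Seen.insert(Cur).second)
      continue;
    for (Node *S : Cur->Succs)
      if (S == Src)
        return false; // Cur feeds Src: sinking it below Src is illegal
    if (Cur->Pos < Src->Pos)
      ToMove.push_back(Cur);
    Work.insert(Work.end(), Cur->Succs.begin(), Cur->Succs.end());
  }
  return true;
}

int main() {
  // Copy defines the value, Blocker reads it and sits between Copy and Src,
  // and Src is the instruction the pass would like to propagate into.
  Node Copy{0, {}}, Blocker{1, {}}, Src{2, {}};
  Copy.Succs = {&Blocker, &Src};
  std::vector<Node *> ToMove;
  if (collectMovable(&Copy, &Src, ToMove))
    std::cout << "can sink " << ToMove.size() << " instruction(s)\n";
  return 0;
}

In the real pass the walk runs over actual SDep edges and the legality rules are stricter; the toy keeps only the overall shape, which is why the real function can fail and report nothing to move.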
From f65245965fd3a2ae083ad3b7b38083ec903f5e02 Mon Sep 17 00:00:00 2001
From: Gabor Spaits
Date: Wed, 21 Aug 2024 23:24:21 +0200
Subject: [PATCH 04/15] Update tests

---
 .../AArch64/GlobalISel/arm64-atomic.ll | 13 +-
 .../CodeGen/AArch64/arm64-non-pow2-ldst.ll | 5 +-
 llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll | 12 +-
 llvm/test/CodeGen/AArch64/fcmp.ll | 18 +-
 llvm/test/CodeGen/AArch64/sext.ll | 41 +-
 .../AArch64/sve-vector-deinterleave.ll | 10 +-
 .../CodeGen/AArch64/sve-vector-interleave.ll | 6 +-
 llvm/test/CodeGen/AArch64/vecreduce-add.ll | 49 +-
 llvm/test/CodeGen/AArch64/zext-to-tbl.ll | 18 +-
 llvm/test/CodeGen/AArch64/zext.ll | 41 +-
 .../CodeGen/ARM/Windows/wineh-framepointer.ll | 3 +
 llvm/test/CodeGen/ARM/aes-erratum-fix.ll | 14 +-
 llvm/test/CodeGen/ARM/fpclamptosat_vec.ll | 53 +-
 .../test/CodeGen/ARM/mcp-dest-regs-no-dup.mir | 3 +
 .../CodeGen/ARM/srem-seteq-illegal-types.ll | 20 +-
 .../ARM/vecreduce-fadd-legalization-strict.ll | 14 +-
 llvm/test/CodeGen/Thumb/smul_fix_sat.ll | 12 +-
 .../Thumb/umulo-128-legalisation-lowering.ll | 4 +-
 .../CodeGen/Thumb2/mve-fpclamptosat_vec.ll | 89 +--
 .../CodeGen/Thumb2/mve-fptosi-sat-vector.ll | 51 +-
 .../CodeGen/Thumb2/mve-fptoui-sat-vector.ll | 47 +-
 .../CodeGen/Thumb2/mve-laneinterleaving.ll | 6 +-
 llvm/test/CodeGen/Thumb2/mve-minmax.ll | 7 +-
 llvm/test/CodeGen/Thumb2/mve-pred-ext.ll | 26 +-
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll | 24 +-
 llvm/test/CodeGen/Thumb2/mve-shufflemov.ll | 50 +-
llvm/test/CodeGen/Thumb2/mve-vabdus.ll | 6 +- llvm/test/CodeGen/Thumb2/mve-vcvt16.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-vld4.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-vmovn.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-vst4.ll | 14 +- .../CodeGen/Thumb2/mve-zext-masked-load.ll | 2 +- llvm/test/CodeGen/X86/apx/mul-i1024.ll | 31 +- llvm/test/CodeGen/X86/atomic-unordered.ll | 2 +- llvm/test/CodeGen/X86/avx512-calling-conv.ll | 38 +- .../CodeGen/X86/avx512-gfni-intrinsics.ll | 72 +- .../test/CodeGen/X86/avx512-insert-extract.ll | 14 +- llvm/test/CodeGen/X86/avx512-mask-op.ll | 10 +- .../X86/avx512bw-intrinsics-upgrade.ll | 4 +- .../X86/avx512vl-intrinsics-upgrade.ll | 12 +- .../X86/div-rem-pair-recomposition-signed.ll | 3 +- .../div-rem-pair-recomposition-unsigned.ll | 8 +- llvm/test/CodeGen/X86/extract-bits.ll | 18 +- llvm/test/CodeGen/X86/legalize-shl-vec.ll | 12 +- llvm/test/CodeGen/X86/matrix-multiply.ll | 71 +- llvm/test/CodeGen/X86/mul-i256.ll | 6 +- llvm/test/CodeGen/X86/mul-i512.ll | 2 +- .../X86/peephole-na-phys-copy-folding.ll | 5 +- llvm/test/CodeGen/X86/pmulh.ll | 18 +- llvm/test/CodeGen/X86/pr34177.ll | 2 +- llvm/test/CodeGen/X86/pr61964.ll | 6 +- llvm/test/CodeGen/X86/shift-i128.ll | 19 +- llvm/test/CodeGen/X86/smul_fix.ll | 6 +- llvm/test/CodeGen/X86/smul_fix_sat.ll | 3 +- .../X86/smulo-128-legalisation-lowering.ll | 18 +- .../subvectorwise-store-of-vector-splat.ll | 38 +- llvm/test/CodeGen/X86/umul_fix.ll | 6 +- llvm/test/CodeGen/X86/umul_fix_sat.ll | 6 +- llvm/test/CodeGen/X86/vec_umulo.ll | 9 +- .../vector-interleaved-load-i16-stride-2.ll | 4 +- .../vector-interleaved-load-i16-stride-3.ll | 35 +- .../vector-interleaved-load-i16-stride-4.ll | 4 +- .../vector-interleaved-load-i16-stride-5.ll | 48 +- .../vector-interleaved-load-i16-stride-6.ll | 36 +- .../vector-interleaved-load-i16-stride-7.ll | 59 +- .../vector-interleaved-load-i16-stride-8.ll | 84 +- .../vector-interleaved-load-i32-stride-3.ll | 76 +- .../vector-interleaved-load-i32-stride-4.ll | 64 +- .../vector-interleaved-load-i32-stride-5.ll | 248 +++--- .../vector-interleaved-load-i32-stride-6.ll | 93 +-- .../vector-interleaved-load-i32-stride-7.ll | 122 +-- .../vector-interleaved-load-i32-stride-8.ll | 406 +++++----- .../vector-interleaved-load-i64-stride-4.ll | 33 +- .../vector-interleaved-load-i64-stride-5.ll | 42 +- .../vector-interleaved-load-i64-stride-6.ll | 40 +- .../vector-interleaved-load-i64-stride-7.ll | 468 ++++++----- .../vector-interleaved-load-i64-stride-8.ll | 750 +++++++++--------- .../vector-interleaved-load-i8-stride-3.ll | 10 +- .../vector-interleaved-load-i8-stride-4.ll | 14 +- .../vector-interleaved-load-i8-stride-5.ll | 10 +- .../vector-interleaved-load-i8-stride-6.ll | 16 +- .../vector-interleaved-load-i8-stride-7.ll | 122 ++- .../vector-interleaved-load-i8-stride-8.ll | 102 +-- .../vector-interleaved-store-i16-stride-3.ll | 150 ++-- .../vector-interleaved-store-i16-stride-4.ll | 64 +- .../vector-interleaved-store-i16-stride-5.ll | 57 +- .../vector-interleaved-store-i16-stride-6.ll | 87 +- .../vector-interleaved-store-i16-stride-7.ll | 75 +- .../vector-interleaved-store-i16-stride-8.ll | 60 +- .../vector-interleaved-store-i32-stride-2.ll | 57 +- .../vector-interleaved-store-i32-stride-3.ll | 11 +- .../vector-interleaved-store-i32-stride-5.ll | 35 +- .../vector-interleaved-store-i32-stride-6.ll | 192 ++--- .../vector-interleaved-store-i32-stride-7.ll | 189 +++-- .../vector-interleaved-store-i32-stride-8.ll | 8 +- .../vector-interleaved-store-i64-stride-3.ll | 20 +- 
.../vector-interleaved-store-i64-stride-4.ll | 176 ++-- .../vector-interleaved-store-i64-stride-5.ll | 20 +- .../vector-interleaved-store-i64-stride-7.ll | 544 ++++++------- .../vector-interleaved-store-i64-stride-8.ll | 304 +++---- .../vector-interleaved-store-i8-stride-3.ll | 5 +- .../vector-interleaved-store-i8-stride-5.ll | 15 +- .../vector-interleaved-store-i8-stride-6.ll | 71 +- .../vector-interleaved-store-i8-stride-7.ll | 35 +- .../vector-interleaved-store-i8-stride-8.ll | 15 +- llvm/test/CodeGen/X86/vector-intrinsics.ll | 4 +- .../X86/vector-shuffle-combining-avx.ll | 23 +- llvm/test/CodeGen/X86/vector-zext.ll | 9 +- .../X86/wide-scalar-shift-legalization.ll | 2 +- .../CodeGen/X86/x86-interleaved-access.ll | 21 +- llvm/test/CodeGen/X86/xmulo.ll | 12 +- 111 files changed, 3035 insertions(+), 3221 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll index 92575d701f428..da75b5bbeb33b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic.ll @@ -106,11 +106,10 @@ define i32 @val_compare_and_swap_from_load(ptr %p, i32 %cmp, ptr %pnew) #0 { ; CHECK-OUTLINE-O1-LABEL: val_compare_and_swap_from_load: ; CHECK-OUTLINE-O1: ; %bb.0: ; CHECK-OUTLINE-O1-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; CHECK-OUTLINE-O1-NEXT: ldr w8, [x2] ; CHECK-OUTLINE-O1-NEXT: mov x3, x0 ; CHECK-OUTLINE-O1-NEXT: mov w0, w1 +; CHECK-OUTLINE-O1-NEXT: ldr w1, [x2] ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 -; CHECK-OUTLINE-O1-NEXT: mov w1, w8 ; CHECK-OUTLINE-O1-NEXT: bl ___aarch64_cas4_acq ; CHECK-OUTLINE-O1-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload ; CHECK-OUTLINE-O1-NEXT: ret @@ -6026,8 +6025,8 @@ define { i8, i1 } @cmpxchg_i8(ptr %ptr, i8 %desired, i8 %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov w19, w1 ; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov w19, w1 ; CHECK-OUTLINE-O1-NEXT: mov w1, w2 ; CHECK-OUTLINE-O1-NEXT: mov w0, w19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 @@ -6133,8 +6132,8 @@ define { i16, i1 } @cmpxchg_i16(ptr %ptr, i16 %desired, i16 %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov w19, w1 ; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov w19, w1 ; CHECK-OUTLINE-O1-NEXT: mov w1, w2 ; CHECK-OUTLINE-O1-NEXT: mov w0, w19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 @@ -6238,8 +6237,8 @@ define { i32, i1 } @cmpxchg_i32(ptr %ptr, i32 %desired, i32 %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov w19, w1 ; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov w19, w1 ; CHECK-OUTLINE-O1-NEXT: mov w1, w2 ; CHECK-OUTLINE-O1-NEXT: mov w0, w19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 @@ -6336,8 +6335,8 @@ define { i64, i1 } @cmpxchg_i64(ptr %ptr, i64 %desired, i64 %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov x19, x1 ; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov x19, x1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: mov x0, x19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 @@ -6434,8 +6433,8 
@@ define { ptr, i1 } @cmpxchg_ptr(ptr %ptr, ptr %desired, ptr %new) { ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w29, -16 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w19, -24 ; CHECK-OUTLINE-O1-NEXT: .cfi_offset w20, -32 -; CHECK-OUTLINE-O1-NEXT: mov x19, x1 ; CHECK-OUTLINE-O1-NEXT: mov x3, x0 +; CHECK-OUTLINE-O1-NEXT: mov x19, x1 ; CHECK-OUTLINE-O1-NEXT: mov x1, x2 ; CHECK-OUTLINE-O1-NEXT: mov x0, x19 ; CHECK-OUTLINE-O1-NEXT: mov x2, x3 diff --git a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll index cd821675bae6e..91d8cf98753c9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-non-pow2-ldst.ll @@ -54,11 +54,10 @@ define i280 @ldi280(ptr %p) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: ldrb w9, [x0, #34] ; CHECK-NEXT: ldrh w10, [x0, #32] -; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ldp x1, x2, [x0, #8] ; CHECK-NEXT: ldr x3, [x0, #24] +; CHECK-NEXT: ldp x1, x2, [x0, #8] +; CHECK-NEXT: ldr x0, [x0] ; CHECK-NEXT: orr x4, x10, x9, lsl #16 -; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret %r = load i280, ptr %p ret i280 %r diff --git a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll index 4558d7c464fe3..186d191444feb 100644 --- a/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll +++ b/llvm/test/CodeGen/AArch64/cmpxchg-idioms.ll @@ -31,10 +31,10 @@ define i32 @test_return(ptr %p, i32 %oldval, i32 %newval) { ; OUTLINE-ATOMICS-NEXT: .cfi_offset w29, -16 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w19, -24 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w20, -32 -; OUTLINE-ATOMICS-NEXT: mov w19, w1 ; OUTLINE-ATOMICS-NEXT: mov x8, x0 +; OUTLINE-ATOMICS-NEXT: mov w19, w1 +; OUTLINE-ATOMICS-NEXT: mov w0, w1 ; OUTLINE-ATOMICS-NEXT: mov w1, w2 -; OUTLINE-ATOMICS-NEXT: mov w0, w19 ; OUTLINE-ATOMICS-NEXT: mov x2, x8 ; OUTLINE-ATOMICS-NEXT: bl ___aarch64_cas4_acq_rel ; OUTLINE-ATOMICS-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload @@ -81,10 +81,10 @@ define i1 @test_return_bool(ptr %value, i8 %oldValue, i8 %newValue) { ; OUTLINE-ATOMICS-NEXT: .cfi_offset w29, -16 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w19, -24 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w20, -32 -; OUTLINE-ATOMICS-NEXT: mov w19, w1 ; OUTLINE-ATOMICS-NEXT: mov x8, x0 +; OUTLINE-ATOMICS-NEXT: mov w19, w1 +; OUTLINE-ATOMICS-NEXT: mov w0, w1 ; OUTLINE-ATOMICS-NEXT: mov w1, w2 -; OUTLINE-ATOMICS-NEXT: mov w0, w19 ; OUTLINE-ATOMICS-NEXT: mov x2, x8 ; OUTLINE-ATOMICS-NEXT: bl ___aarch64_cas1_acq_rel ; OUTLINE-ATOMICS-NEXT: cmp w0, w19, uxtb @@ -126,10 +126,10 @@ define void @test_conditional(ptr %p, i32 %oldval, i32 %newval) { ; OUTLINE-ATOMICS-NEXT: .cfi_offset w29, -16 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w19, -24 ; OUTLINE-ATOMICS-NEXT: .cfi_offset w20, -32 -; OUTLINE-ATOMICS-NEXT: mov w19, w1 ; OUTLINE-ATOMICS-NEXT: mov x8, x0 +; OUTLINE-ATOMICS-NEXT: mov w19, w1 +; OUTLINE-ATOMICS-NEXT: mov w0, w1 ; OUTLINE-ATOMICS-NEXT: mov w1, w2 -; OUTLINE-ATOMICS-NEXT: mov w0, w19 ; OUTLINE-ATOMICS-NEXT: mov x2, x8 ; OUTLINE-ATOMICS-NEXT: bl ___aarch64_cas4_acq_rel ; OUTLINE-ATOMICS-NEXT: cmp w0, w19 diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index cd227a0235766..a5d7ae147ffda 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -1618,23 +1618,23 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w1 ; CHECK-GI-NOFP16-NEXT: mov v18.s[1], w5 ; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w8 -; CHECK-GI-NOFP16-NEXT: mov w8, #-1 
// =0xffffffff ; CHECK-GI-NOFP16-NEXT: fmov w9, s5 +; CHECK-GI-NOFP16-NEXT: fmov s5, w7 ; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v6.h[0] ; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #8] -; CHECK-GI-NOFP16-NEXT: fmov s5, w7 +; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s ; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v7.h[0] ; CHECK-GI-NOFP16-NEXT: ldr s7, [sp, #24] ; CHECK-GI-NOFP16-NEXT: mov v16.s[2], w2 -; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6 -; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8 ; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w9 ; CHECK-GI-NOFP16-NEXT: fmov w9, s6 ; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #16] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8 +; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v17.s[0] ; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #40] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6 ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: mov v16.s[3], w3 ; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w9 @@ -1687,7 +1687,6 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40] ; CHECK-GI-FP16-NEXT: fmov s1, w8 ; CHECK-GI-FP16-NEXT: umov w8, v0.h[6] -; CHECK-GI-FP16-NEXT: fmov s5, w9 ; CHECK-GI-FP16-NEXT: mov v2.s[2], w10 ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: mov v6.s[2], w2 @@ -1695,19 +1694,20 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0] ; CHECK-GI-FP16-NEXT: mov v1.s[1], w9 ; CHECK-GI-FP16-NEXT: mov w9, #-1 // =0xffffffff -; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 +; CHECK-GI-FP16-NEXT: fmov s5, w9 ; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-GI-FP16-NEXT: mov v6.s[3], w3 ; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s3 ; CHECK-GI-FP16-NEXT: fmov s3, w7 -; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 +; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 ; CHECK-GI-FP16-NEXT: sshr v0.4s, v0.4s, #31 ; CHECK-GI-FP16-NEXT: mov v3.s[1], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s4 ; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16] ; CHECK-GI-FP16-NEXT: ushl v1.4s, v1.4s, v2.4s ; CHECK-GI-FP16-NEXT: neg v2.4s, v2.4s +; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 ; CHECK-GI-FP16-NEXT: mov v3.s[2], w8 ; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v2.4s ; CHECK-GI-FP16-NEXT: fmov w8, s4 diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 7d7b862098879..2dcd71b39bd0e 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -337,9 +337,9 @@ define <3 x i64> @sext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-GI-LABEL: sext_v3i16_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: smov x8, v0.h[0] ; CHECK-GI-NEXT: smov x9, v0.h[1] ; CHECK-GI-NEXT: smov x10, v0.h[2] -; CHECK-GI-NEXT: smov x8, v0.h[0] ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 @@ -362,9 +362,9 @@ define <3 x i64> @sext_v3i32_v3i64(<3 x i32> %a) { ; ; CHECK-GI-LABEL: sext_v3i32_v3i64: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: smov x8, v0.s[0] ; CHECK-GI-NEXT: smov x9, v0.s[1] ; CHECK-GI-NEXT: smov x10, v0.s[2] -; CHECK-GI-NEXT: smov x8, v0.s[0] ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 @@ -702,20 +702,18 @@ entry: define <8 x i64> @sext_v8i32_v8i64(<8 x i32> %a) { ; CHECK-SD-LABEL: sext_v8i32_v8i64: ; CHECK-SD: // 
%bb.0: // %entry -; CHECK-SD-NEXT: sshll2 v4.2d, v0.4s, #0 ; CHECK-SD-NEXT: sshll2 v3.2d, v1.4s, #0 -; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: sshll v2.2d, v1.2s, #0 -; CHECK-SD-NEXT: mov v1.16b, v4.16b +; CHECK-SD-NEXT: sshll2 v1.2d, v0.4s, #0 +; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v8i32_v8i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll2 v5.2d, v0.4s, #0 ; CHECK-GI-NEXT: sshll v2.2d, v1.2s, #0 -; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: sshll2 v3.2d, v1.4s, #0 -; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: sshll2 v1.2d, v0.4s, #0 +; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ret entry: %c = sext <8 x i32> %a to <8 x i64> @@ -879,20 +877,18 @@ entry: define <16 x i32> @sext_v16i16_v16i32(<16 x i16> %a) { ; CHECK-SD-LABEL: sext_v16i16_v16i32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll2 v4.4s, v0.8h, #0 ; CHECK-SD-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-SD-NEXT: mov v1.16b, v4.16b +; CHECK-SD-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-SD-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v16i16_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll2 v5.4s, v0.8h, #0 ; CHECK-GI-NEXT: sshll v2.4s, v1.4h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: sshll2 v1.4s, v0.8h, #0 +; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: ret entry: %c = sext <16 x i16> %a to <16 x i32> @@ -939,29 +935,26 @@ entry: define <16 x i64> @sext_v16i32_v16i64(<16 x i32> %a) { ; CHECK-SD-LABEL: sext_v16i32_v16i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sshll v18.2d, v1.2s, #0 -; CHECK-SD-NEXT: sshll2 v16.2d, v1.4s, #0 -; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: sshll2 v7.2d, v3.4s, #0 +; CHECK-SD-NEXT: sshll v6.2d, v3.2s, #0 +; CHECK-SD-NEXT: sshll2 v3.2d, v1.4s, #0 ; CHECK-SD-NEXT: sshll v4.2d, v2.2s, #0 ; CHECK-SD-NEXT: sshll2 v5.2d, v2.4s, #0 -; CHECK-SD-NEXT: sshll v6.2d, v3.2s, #0 +; CHECK-SD-NEXT: sshll v2.2d, v1.2s, #0 ; CHECK-SD-NEXT: sshll2 v1.2d, v0.4s, #0 -; CHECK-SD-NEXT: mov v2.16b, v18.16b -; CHECK-SD-NEXT: mov v3.16b, v16.16b +; CHECK-SD-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sext_v16i32_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll2 v19.2d, v1.4s, #0 -; CHECK-GI-NEXT: sshll2 v17.2d, v0.4s, #0 ; CHECK-GI-NEXT: sshll v18.2d, v1.2s, #0 +; CHECK-GI-NEXT: sshll2 v19.2d, v1.4s, #0 +; CHECK-GI-NEXT: sshll2 v1.2d, v0.4s, #0 ; CHECK-GI-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: sshll v4.2d, v2.2s, #0 +; CHECK-GI-NEXT: sshll2 v5.2d, v2.4s, #0 ; CHECK-GI-NEXT: sshll v6.2d, v3.2s, #0 ; CHECK-GI-NEXT: sshll2 v7.2d, v3.4s, #0 -; CHECK-GI-NEXT: sshll2 v5.2d, v2.4s, #0 -; CHECK-GI-NEXT: mov v1.16b, v17.16b ; CHECK-GI-NEXT: mov v2.16b, v18.16b ; CHECK-GI-NEXT: mov v3.16b, v19.16b ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll index eafb71a0b23a3..63f2df84d8173 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll @@ -170,11 +170,10 @@ define {, } @vector_deinterleave_nxv2i1_nxv4i1 define {, } @vector_deinterleave_nxv4i64_nxv8i64( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv4i64_nxv8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d ; 
CHECK-NEXT: uzp2 z6.d, z0.d, z1.d ; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d ; CHECK-NEXT: uzp2 z3.d, z2.d, z3.d -; CHECK-NEXT: mov z1.d, z4.d ; CHECK-NEXT: mov z2.d, z6.d ; CHECK-NEXT: ret %retval = call {, } @llvm.vector.deinterleave2.nxv8i64( %vec) @@ -184,15 +183,14 @@ ret {, } %retval define {, } @vector_deinterleave_nxv8i64_nxv16i64( %vec) { ; CHECK-LABEL: vector_deinterleave_nxv8i64_nxv16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: uzp1 z24.d, z2.d, z3.d -; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d -; CHECK-NEXT: uzp1 z2.d, z4.d, z5.d ; CHECK-NEXT: uzp2 z28.d, z0.d, z1.d ; CHECK-NEXT: uzp2 z29.d, z2.d, z3.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d +; CHECK-NEXT: uzp1 z2.d, z4.d, z5.d ; CHECK-NEXT: uzp1 z3.d, z6.d, z7.d ; CHECK-NEXT: uzp2 z7.d, z6.d, z7.d ; CHECK-NEXT: uzp2 z6.d, z4.d, z5.d -; CHECK-NEXT: mov z1.d, z24.d ; CHECK-NEXT: mov z4.d, z28.d ; CHECK-NEXT: mov z5.d, z29.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll index fe089fa4a6417..39dda7fdfb57d 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll @@ -165,10 +165,9 @@ define @interleave2_nxv16i32( %vec0, @llvm.vector.interleave2.nxv16i32( %vec0, %vec1) @@ -179,10 +178,9 @@ define @interleave2_nxv8i64( %vec0, @llvm.vector.interleave2.nxv8i64( %vec0, %vec1) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 150a67ab7974d..54ada05c90448 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -4819,47 +4819,45 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI-NEXT: usubl v1.8h, v1.8b, v2.8b ; CHECK-GI-NEXT: ldr d3, [x10] ; CHECK-GI-NEXT: ldr d4, [x11] -; CHECK-GI-NEXT: add x10, x10, x9 -; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: sshll v5.4s, v0.4h, #0 ; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: ldr d2, [x10] -; CHECK-GI-NEXT: ldr d6, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 +; CHECK-GI-NEXT: ldr d2, [x10] +; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: sshll v7.4s, v1.4h, #0 ; CHECK-GI-NEXT: sshll2 v1.4s, v1.8h, #0 +; CHECK-GI-NEXT: ldr d6, [x11] +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: usubl v3.8h, v3.8b, v4.8b -; CHECK-GI-NEXT: ldr d4, [x10] -; CHECK-GI-NEXT: ldr d16, [x11] ; CHECK-GI-NEXT: abs v5.4s, v5.4s ; CHECK-GI-NEXT: abs v0.4s, v0.4s +; CHECK-GI-NEXT: ldr d4, [x10] +; CHECK-GI-NEXT: ldr d16, [x11] ; CHECK-GI-NEXT: abs v7.4s, v7.4s +; CHECK-GI-NEXT: abs v1.4s, v1.4s ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 -; CHECK-GI-NEXT: abs v1.4s, v1.4s -; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v16.8b ; CHECK-GI-NEXT: usubl v2.8h, v2.8b, v6.8b ; CHECK-GI-NEXT: ldr d6, [x10] ; CHECK-GI-NEXT: ldr d17, [x11] ; CHECK-GI-NEXT: add x10, x10, x9 ; CHECK-GI-NEXT: add x11, x11, x8 +; CHECK-GI-NEXT: usubl v4.8h, v4.8b, v16.8b +; CHECK-GI-NEXT: sshll v16.4s, v3.4h, #0 +; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-GI-NEXT: ldr d5, [x10] ; CHECK-GI-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-GI-NEXT: ldr d5, [x10] ; CHECK-GI-NEXT: ldr d7, [x11] -; CHECK-GI-NEXT: sshll v19.4s, v4.4h, #0 -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-GI-NEXT: sshll v16.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-GI-NEXT: sshll v18.4s, v2.4h, #0 ; CHECK-GI-NEXT: 
sshll2 v2.4s, v2.8h, #0 ; CHECK-GI-NEXT: usubl v6.8h, v6.8b, v17.8b +; CHECK-GI-NEXT: ldr d17, [x11, x8] +; CHECK-GI-NEXT: sshll v19.4s, v4.4h, #0 ; CHECK-GI-NEXT: usubl v5.8h, v5.8b, v7.8b ; CHECK-GI-NEXT: ldr d7, [x10, x9] -; CHECK-GI-NEXT: ldr d17, [x11, x8] -; CHECK-GI-NEXT: abs v19.4s, v19.4s -; CHECK-GI-NEXT: abs v4.4s, v4.4s +; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 ; CHECK-GI-NEXT: abs v16.4s, v16.4s ; CHECK-GI-NEXT: abs v3.4s, v3.4s ; CHECK-GI-NEXT: abs v18.4s, v18.4s @@ -4867,33 +4865,36 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI-NEXT: usubl v7.8h, v7.8b, v17.8b ; CHECK-GI-NEXT: sshll v17.4s, v6.4h, #0 ; CHECK-GI-NEXT: sshll2 v6.4s, v6.8h, #0 -; CHECK-GI-NEXT: add v4.4s, v19.4s, v4.4s -; CHECK-GI-NEXT: addv s1, v1.4s -; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: abs v19.4s, v19.4s +; CHECK-GI-NEXT: abs v4.4s, v4.4s ; CHECK-GI-NEXT: add v3.4s, v16.4s, v3.4s ; CHECK-GI-NEXT: sshll v16.4s, v5.4h, #0 ; CHECK-GI-NEXT: sshll2 v5.4s, v5.8h, #0 ; CHECK-GI-NEXT: add v2.4s, v18.4s, v2.4s ; CHECK-GI-NEXT: abs v17.4s, v17.4s +; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: abs v6.4s, v6.4s -; CHECK-GI-NEXT: addv s4, v4.4s +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: add v4.4s, v19.4s, v4.4s +; CHECK-GI-NEXT: addv s3, v3.4s ; CHECK-GI-NEXT: sshll v18.4s, v7.4h, #0 ; CHECK-GI-NEXT: sshll2 v7.4s, v7.8h, #0 ; CHECK-GI-NEXT: abs v16.4s, v16.4s ; CHECK-GI-NEXT: abs v5.4s, v5.4s ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: add v6.4s, v17.4s, v6.4s -; CHECK-GI-NEXT: fmov w9, s0 ; CHECK-GI-NEXT: addv s2, v2.4s -; CHECK-GI-NEXT: addv s3, v3.4s -; CHECK-GI-NEXT: fmov w10, s4 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: addv s4, v4.4s +; CHECK-GI-NEXT: fmov w10, s3 ; CHECK-GI-NEXT: abs v18.4s, v18.4s ; CHECK-GI-NEXT: abs v7.4s, v7.4s ; CHECK-GI-NEXT: add v1.4s, v16.4s, v5.4s ; CHECK-GI-NEXT: add w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s2 ; CHECK-GI-NEXT: addv s3, v6.4s +; CHECK-GI-NEXT: fmov w9, s2 ; CHECK-GI-NEXT: add w8, w10, w8 +; CHECK-GI-NEXT: fmov w10, s4 ; CHECK-GI-NEXT: add v0.4s, v18.4s, v7.4s ; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: add w8, w9, w8 diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 38ad96df79cf6..66bb131ce7249 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -2848,21 +2848,21 @@ define i32 @test_widening_instr_mull(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-BE-NEXT: ld1 { v0.16b }, [x1] ; CHECK-BE-NEXT: ld1 { v1.8h }, [x0] +; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] ; CHECK-BE-NEXT: add x9, x0, #48 ; CHECK-BE-NEXT: add x10, x0, #32 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: add x1, x1, #16 ; CHECK-BE-NEXT: ushll v2.8h, v0.8b, #0 -; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-BE-NEXT: ld1 { v3.8h }, [x8] -; CHECK-BE-NEXT: add x8, x0, #16 ; CHECK-BE-NEXT: umull v4.4s, v1.4h, v2.4h -; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h +; CHECK-BE-NEXT: umull2 v5.4s, v3.8h, v0.8h ; CHECK-BE-NEXT: umull v0.4s, v3.4h, v0.4h -; CHECK-BE-NEXT: st1 { v5.4s }, [x9] +; CHECK-BE-NEXT: umull2 v1.4s, v1.8h, v2.8h ; CHECK-BE-NEXT: st1 { v4.4s }, [x0] ; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v5.4s }, [x9] ; CHECK-BE-NEXT: st1 { v0.4s }, [x10] ; CHECK-BE-NEXT: st1 { v1.4s }, [x8] ; CHECK-BE-NEXT: b.ne .LBB24_1 @@ -2980,16 +2980,16 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 
%h) { ; CHECK-BE-NEXT: add x10, x0, #16 ; CHECK-BE-NEXT: subs w2, w2, #1 ; CHECK-BE-NEXT: ext v17.16b, v5.16b, v5.16b, #8 -; CHECK-BE-NEXT: rev32 v5.8b, v5.8b ; CHECK-BE-NEXT: ext v19.16b, v6.16b, v6.16b, #8 +; CHECK-BE-NEXT: rev32 v5.8b, v5.8b ; CHECK-BE-NEXT: rev32 v21.8b, v7.8b ; CHECK-BE-NEXT: rev32 v23.8b, v4.8b ; CHECK-BE-NEXT: ext v7.16b, v7.16b, v7.16b, #8 ; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8 ; CHECK-BE-NEXT: rev32 v6.8b, v6.8b ; CHECK-BE-NEXT: rev32 v17.8b, v17.8b -; CHECK-BE-NEXT: umull v5.2d, v5.2s, v18.2s ; CHECK-BE-NEXT: rev32 v19.8b, v19.8b +; CHECK-BE-NEXT: umull v5.2d, v5.2s, v18.2s ; CHECK-BE-NEXT: umull v18.2d, v21.2s, v22.2s ; CHECK-BE-NEXT: ext v21.16b, v22.16b, v22.16b, #8 ; CHECK-BE-NEXT: rev32 v7.8b, v7.8b @@ -2997,9 +2997,9 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: ext v16.16b, v16.16b, v16.16b, #8 ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: umull v17.2d, v17.2s, v24.2s +; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s ; CHECK-BE-NEXT: st1 { v5.2d }, [x8] ; CHECK-BE-NEXT: umull v5.2d, v6.2s, v20.2s -; CHECK-BE-NEXT: umull v19.2d, v19.2s, v25.2s ; CHECK-BE-NEXT: umull v6.2d, v7.2s, v21.2s ; CHECK-BE-NEXT: add x8, x0, #112 ; CHECK-BE-NEXT: umull v4.2d, v4.2s, v16.2s @@ -3007,11 +3007,11 @@ define i32 @test_widening_instr_mull_64(ptr %p1, ptr %p2, i32 %h) { ; CHECK-BE-NEXT: add x9, x0, #80 ; CHECK-BE-NEXT: st1 { v22.2d }, [x0] ; CHECK-BE-NEXT: st1 { v17.2d }, [x8] -; CHECK-BE-NEXT: st1 { v5.2d }, [x8] ; CHECK-BE-NEXT: add x8, x0, #64 ; CHECK-BE-NEXT: st1 { v19.2d }, [x9] ; CHECK-BE-NEXT: add x9, x0, #48 ; CHECK-BE-NEXT: mov x0, x8 +; CHECK-BE-NEXT: st1 { v5.2d }, [x8] ; CHECK-BE-NEXT: st1 { v6.2d }, [x9] ; CHECK-BE-NEXT: st1 { v4.2d }, [x10] ; CHECK-BE-NEXT: b.ne .LBB25_1 diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll index 5cd7fd15735f3..aa4fe9dc4ff9a 100644 --- a/llvm/test/CodeGen/AArch64/zext.ll +++ b/llvm/test/CodeGen/AArch64/zext.ll @@ -354,9 +354,9 @@ define <3 x i64> @zext_v3i16_v3i64(<3 x i16> %a) { ; CHECK-GI-LABEL: zext_v3i16_v3i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: umov w9, v0.h[1] ; CHECK-GI-NEXT: umov w10, v0.h[2] -; CHECK-GI-NEXT: umov w8, v0.h[0] ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 @@ -379,9 +379,9 @@ define <3 x i64> @zext_v3i32_v3i64(<3 x i32> %a) { ; ; CHECK-GI-LABEL: zext_v3i32_v3i64: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: mov w9, v0.s[1] ; CHECK-GI-NEXT: mov w10, v0.s[2] -; CHECK-GI-NEXT: mov w8, v0.s[0] ; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: fmov d1, x9 ; CHECK-GI-NEXT: fmov d2, x10 @@ -727,20 +727,18 @@ entry: define <8 x i64> @zext_v8i32_v8i64(<8 x i32> %a) { ; CHECK-SD-LABEL: zext_v8i32_v8i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll2 v4.2d, v0.4s, #0 ; CHECK-SD-NEXT: ushll2 v3.2d, v1.4s, #0 -; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: ushll v2.2d, v1.2s, #0 -; CHECK-SD-NEXT: mov v1.16b, v4.16b +; CHECK-SD-NEXT: ushll2 v1.2d, v0.4s, #0 +; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: zext_v8i32_v8i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll2 v5.2d, v0.4s, #0 ; CHECK-GI-NEXT: ushll v2.2d, v1.2s, #0 -; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll2 v3.2d, v1.4s, #0 -; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: ushll2 v1.2d, v0.4s, #0 +; 
CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ret entry: %c = zext <8 x i32> %a to <8 x i64> @@ -896,20 +894,18 @@ entry: define <16 x i32> @zext_v16i16_v16i32(<16 x i16> %a) { ; CHECK-SD-LABEL: zext_v16i16_v16i32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll2 v4.4s, v0.8h, #0 ; CHECK-SD-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-SD-NEXT: mov v1.16b, v4.16b +; CHECK-SD-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: zext_v16i16_v16i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll2 v5.4s, v0.8h, #0 ; CHECK-GI-NEXT: ushll v2.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: ushll2 v3.4s, v1.8h, #0 -; CHECK-GI-NEXT: mov v1.16b, v5.16b +; CHECK-GI-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-NEXT: ret entry: %c = zext <16 x i16> %a to <16 x i32> @@ -956,29 +952,26 @@ entry: define <16 x i64> @zext_v16i32_v16i64(<16 x i32> %a) { ; CHECK-SD-LABEL: zext_v16i32_v16i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll v18.2d, v1.2s, #0 -; CHECK-SD-NEXT: ushll2 v16.2d, v1.4s, #0 -; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: ushll2 v7.2d, v3.4s, #0 +; CHECK-SD-NEXT: ushll v6.2d, v3.2s, #0 +; CHECK-SD-NEXT: ushll2 v3.2d, v1.4s, #0 ; CHECK-SD-NEXT: ushll v4.2d, v2.2s, #0 ; CHECK-SD-NEXT: ushll2 v5.2d, v2.4s, #0 -; CHECK-SD-NEXT: ushll v6.2d, v3.2s, #0 +; CHECK-SD-NEXT: ushll v2.2d, v1.2s, #0 ; CHECK-SD-NEXT: ushll2 v1.2d, v0.4s, #0 -; CHECK-SD-NEXT: mov v2.16b, v18.16b -; CHECK-SD-NEXT: mov v3.16b, v16.16b +; CHECK-SD-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: zext_v16i32_v16i64: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll2 v19.2d, v1.4s, #0 -; CHECK-GI-NEXT: ushll2 v17.2d, v0.4s, #0 ; CHECK-GI-NEXT: ushll v18.2d, v1.2s, #0 +; CHECK-GI-NEXT: ushll2 v19.2d, v1.4s, #0 +; CHECK-GI-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-GI-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-GI-NEXT: ushll v4.2d, v2.2s, #0 +; CHECK-GI-NEXT: ushll2 v5.2d, v2.4s, #0 ; CHECK-GI-NEXT: ushll v6.2d, v3.2s, #0 ; CHECK-GI-NEXT: ushll2 v7.2d, v3.4s, #0 -; CHECK-GI-NEXT: ushll2 v5.2d, v2.4s, #0 -; CHECK-GI-NEXT: mov v1.16b, v17.16b ; CHECK-GI-NEXT: mov v2.16b, v18.16b ; CHECK-GI-NEXT: mov v3.16b, v19.16b ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/ARM/Windows/wineh-framepointer.ll b/llvm/test/CodeGen/ARM/Windows/wineh-framepointer.ll index 17197006d8261..96557848ed401 100644 --- a/llvm/test/CodeGen/ARM/Windows/wineh-framepointer.ll +++ b/llvm/test/CodeGen/ARM/Windows/wineh-framepointer.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ;; Check that this produces the expected assembly output ; RUN: llc -mtriple=thumbv7-windows -o - %s -verify-machineinstrs | FileCheck %s ;; Also try to write an object file, which verifies that the SEH opcodes @@ -174,3 +175,5 @@ declare void @llvm.va_start(ptr) declare void @llvm.va_end(ptr) declare arm_aapcs_vfpcc void @other2(i32 noundef, ptr noundef, ptr noundef, ptr noundef) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll index 43c403fe6d64d..82f3b453e5fec 100644 --- a/llvm/test/CodeGen/ARM/aes-erratum-fix.ll +++ b/llvm/test/CodeGen/ARM/aes-erratum-fix.ll @@ -1451,8 +1451,6 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: vmov.u16 r6, d17[0] -; CHECK-CORTEX-FIX-NEXT: vmov.u16 r7, d17[2] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d16[2] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill @@ -1462,7 +1460,6 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16 ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r3, r6 ; CHECK-CORTEX-FIX-NEXT: b .LBB36_3 ; CHECK-CORTEX-FIX-NEXT: .LBB36_2: ; CHECK-CORTEX-FIX-NEXT: add r3, r2, #8 @@ -1479,12 +1476,12 @@ define arm_aapcs_vfpcc void @aese_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16 ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d16[2] ; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov.u16 r7, d17[2] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d16[3] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d17[0] ; CHECK-CORTEX-FIX-NEXT: .LBB36_3: +; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d17[0] +; CHECK-CORTEX-FIX-NEXT: vmov.u16 r7, d17[2] ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r4, d17[3] ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 ; CHECK-CORTEX-FIX-NEXT: beq .LBB36_5 @@ -3604,8 +3601,6 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16 ; CHECK-CORTEX-FIX-NEXT: @ %bb.1: ; CHECK-CORTEX-FIX-NEXT: vld1.64 {d16, d17}, [r2] ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d16[1] -; CHECK-CORTEX-FIX-NEXT: vmov.u16 r6, d17[0] -; CHECK-CORTEX-FIX-NEXT: vmov.u16 r7, d17[2] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d16[2] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill @@ -3615,7 +3610,6 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16 ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #12] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: ldrh r3, [r1] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: mov r3, r6 ; CHECK-CORTEX-FIX-NEXT: b .LBB82_3 ; CHECK-CORTEX-FIX-NEXT: .LBB82_2: ; CHECK-CORTEX-FIX-NEXT: add r3, r2, #8 @@ -3632,12 +3626,12 @@ define arm_aapcs_vfpcc void @aesd_setf16_cond_via_ptr(i1 zeroext %0, ptr %1, <16 ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #20] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d16[2] ; CHECK-CORTEX-FIX-NEXT: str r7, [sp, #12] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov.u16 r7, d17[2] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #8] @ 4-byte Spill ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d16[3] ; CHECK-CORTEX-FIX-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d17[0] ; CHECK-CORTEX-FIX-NEXT: .LBB82_3: +; CHECK-CORTEX-FIX-NEXT: vmov.u16 r3, d17[0] +; CHECK-CORTEX-FIX-NEXT: vmov.u16 r7, d17[2] ; CHECK-CORTEX-FIX-NEXT: vmov.u16 r4, d17[3] ; CHECK-CORTEX-FIX-NEXT: cmp r0, #0 ; 
CHECK-CORTEX-FIX-NEXT: beq .LBB82_5 diff --git a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll index b8ea7c10ad2f4..631f4bcbdd51a 100644 --- a/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/ARM/fpclamptosat_vec.ll @@ -1268,6 +1268,7 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 +; CHECK-NEON-NEXT: vmov.f32 s20, s5 ; CHECK-NEON-NEXT: vmov.f32 s22, s4 ; CHECK-NEON-NEXT: vmov.f32 s24, s3 ; CHECK-NEON-NEXT: vmov.f32 s26, s2 @@ -1301,15 +1302,14 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 +; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 -; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 -; CHECK-NEON-NEXT: mov r0, r1 +; CHECK-NEON-NEXT: vmov r0, s20 +; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: vmov r0, s20 @@ -1513,6 +1513,7 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 +; CHECK-NEON-NEXT: vmov.f32 s20, s5 ; CHECK-NEON-NEXT: vmov.f32 s22, s4 ; CHECK-NEON-NEXT: vmov.f32 s24, s3 ; CHECK-NEON-NEXT: vmov.f32 s26, s2 @@ -1546,15 +1547,14 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 +; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 -; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 -; CHECK-NEON-NEXT: mov r0, r1 +; CHECK-NEON-NEXT: vmov r0, s20 +; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: vmov r0, s20 @@ -2469,11 +2469,10 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-NEXT: moveq r0, r5 ; CHECK-NEXT: rsbs r1, r0, #0 ; CHECK-NEXT: rscs r1, r3, #0 -; CHECK-NEXT: vmov r2, r1, d9 ; CHECK-NEXT: movwlt r6, #1 ; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: movne r6, r0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: vmov r0, r1, d9 ; CHECK-NEXT: bl __aeabi_d2lz ; CHECK-NEXT: subs r2, r0, r5 ; CHECK-NEXT: vmov.32 d0[0], r6 @@ -2632,12 +2631,10 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-NEXT: vorr q4, q0, q0 ; CHECK-NEXT: vmov r0, s19 ; CHECK-NEXT: bl __aeabi_f2lz -; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: mvn r7, #0 ; CHECK-NEXT: subs r3, r0, r7 -; CHECK-NEXT: mov r4, #0 ; CHECK-NEXT: sbcs r3, r1, #0 -; CHECK-NEXT: vmov r9, s18 +; CHECK-NEXT: mov r4, #0 ; CHECK-NEXT: mov r3, #0 ; CHECK-NEXT: mov r10, #0 ; CHECK-NEXT: movwlt r3, #1 @@ -2645,12 +2642,13 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-NEXT: movne r3, r1 ; CHECK-NEXT: moveq r0, r7 ; CHECK-NEXT: rsbs r1, r0, #0 -; CHECK-NEXT: vmov r8, s17 +; CHECK-NEXT: vmov r9, s18 ; CHECK-NEXT: rscs r1, r3, #0 +; CHECK-NEXT: vmov r8, s17 ; CHECK-NEXT: movwlt r4, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: movne r4, r0 -; CHECK-NEXT: mov r0, r2 +; CHECK-NEXT: 
vmov r0, s16 ; CHECK-NEXT: bl __aeabi_f2lz ; CHECK-NEXT: subs r2, r0, r7 ; CHECK-NEXT: mov r5, #0 @@ -2971,12 +2969,10 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NEON-NEXT: vmov.f32 s20, s0 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz -; CHECK-NEON-NEXT: vmov r2, s20 ; CHECK-NEON-NEXT: mvn r7, #0 ; CHECK-NEON-NEXT: subs r3, r0, r7 -; CHECK-NEON-NEXT: mov r4, #0 ; CHECK-NEON-NEXT: sbcs r3, r1, #0 -; CHECK-NEON-NEXT: vmov r8, s18 +; CHECK-NEON-NEXT: mov r4, #0 ; CHECK-NEON-NEXT: mov r3, #0 ; CHECK-NEON-NEXT: mov r10, #0 ; CHECK-NEON-NEXT: movwlt r3, #1 @@ -2984,12 +2980,13 @@ define <4 x i32> @ustest_f16i32_mm(<4 x half> %x) { ; CHECK-NEON-NEXT: movne r3, r1 ; CHECK-NEON-NEXT: moveq r0, r7 ; CHECK-NEON-NEXT: rsbs r1, r0, #0 -; CHECK-NEON-NEXT: vmov r9, s16 +; CHECK-NEON-NEXT: vmov r8, s18 ; CHECK-NEON-NEXT: rscs r1, r3, #0 +; CHECK-NEON-NEXT: vmov r9, s16 ; CHECK-NEON-NEXT: movwlt r4, #1 ; CHECK-NEON-NEXT: cmp r4, #0 ; CHECK-NEON-NEXT: movne r4, r0 -; CHECK-NEON-NEXT: mov r0, r2 +; CHECK-NEON-NEXT: vmov r0, s20 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: bl __aeabi_f2lz ; CHECK-NEON-NEXT: subs r2, r0, r7 @@ -3257,6 +3254,7 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 +; CHECK-NEON-NEXT: vmov.f32 s20, s5 ; CHECK-NEON-NEXT: vmov.f32 s22, s4 ; CHECK-NEON-NEXT: vmov.f32 s24, s3 ; CHECK-NEON-NEXT: vmov.f32 s26, s2 @@ -3290,15 +3288,14 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 +; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 -; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 -; CHECK-NEON-NEXT: mov r0, r1 +; CHECK-NEON-NEXT: vmov r0, s20 +; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: vmov r0, s20 @@ -3499,6 +3496,7 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: vmov r0, s1 ; CHECK-NEON-NEXT: vmov.f32 s16, s7 ; CHECK-NEON-NEXT: vmov.f32 s18, s6 +; CHECK-NEON-NEXT: vmov.f32 s20, s5 ; CHECK-NEON-NEXT: vmov.f32 s22, s4 ; CHECK-NEON-NEXT: vmov.f32 s24, s3 ; CHECK-NEON-NEXT: vmov.f32 s26, s2 @@ -3532,15 +3530,14 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: vmov s2, r5 -; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s0 -; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 +; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vcvt.s32.f32 s0, s30 -; CHECK-NEON-NEXT: vmov r1, s20 ; CHECK-NEON-NEXT: vmov.32 d8[0], r0 ; CHECK-NEON-NEXT: vmov r0, s0 ; CHECK-NEON-NEXT: vmov.32 d12[0], r0 -; CHECK-NEON-NEXT: mov r0, r1 +; CHECK-NEON-NEXT: vmov r0, s20 +; CHECK-NEON-NEXT: vcvt.s32.f32 s20, s2 ; CHECK-NEON-NEXT: bl __aeabi_h2f ; CHECK-NEON-NEXT: vmov s0, r0 ; CHECK-NEON-NEXT: vmov r0, s20 diff --git a/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir b/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir index c5a8fabfdc79b..d4446726aaaa7 100644 --- a/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir +++ b/llvm/test/CodeGen/ARM/mcp-dest-regs-no-dup.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=arm-eabi -O1 -run-pass=machine-cp %s -o - \ # RUN: -verify-machineinstrs -simplify-mir | FileCheck %s @@ -11,3 +12,5 @@ body: | renamable $r9 = COPY killed renamable $r0 ... +## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +# CHECK: {{.*}} diff --git a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll index 7f56215b9b412..7c82f8294ade3 100644 --- a/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/ARM/srem-seteq-illegal-types.ll @@ -382,14 +382,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; ARM7-NEXT: bl __moddi3 ; ARM7-NEXT: vmov.32 d8[0], r0 ; ARM7-NEXT: ldr r0, [sp, #44] -; ARM7-NEXT: ldr r2, [sp, #40] ; ARM7-NEXT: mov r5, r1 +; ARM7-NEXT: mvn r2, #8 ; ARM7-NEXT: and r0, r0, #1 ; ARM7-NEXT: mvn r3, #0 ; ARM7-NEXT: rsb r1, r0, #0 +; ARM7-NEXT: ldr r0, [sp, #40] ; ARM7-NEXT: vmov.32 d9[0], r7 -; ARM7-NEXT: mov r0, r2 -; ARM7-NEXT: mvn r2, #8 ; ARM7-NEXT: bl __moddi3 ; ARM7-NEXT: vmov.32 d16[0], r0 ; ARM7-NEXT: adr r0, .LCPI3_0 @@ -458,14 +457,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; ARM8-NEXT: bl __moddi3 ; ARM8-NEXT: vmov.32 d8[0], r0 ; ARM8-NEXT: ldr r0, [sp, #44] -; ARM8-NEXT: ldr r2, [sp, #40] ; ARM8-NEXT: mov r5, r1 +; ARM8-NEXT: mvn r2, #8 ; ARM8-NEXT: and r0, r0, #1 ; ARM8-NEXT: mvn r3, #0 ; ARM8-NEXT: rsb r1, r0, #0 +; ARM8-NEXT: ldr r0, [sp, #40] ; ARM8-NEXT: vmov.32 d9[0], r7 -; ARM8-NEXT: mov r0, r2 -; ARM8-NEXT: mvn r2, #8 ; ARM8-NEXT: bl __moddi3 ; ARM8-NEXT: vmov.32 d16[0], r0 ; ARM8-NEXT: adr r0, .LCPI3_0 @@ -534,14 +532,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; NEON7-NEXT: bl __moddi3 ; NEON7-NEXT: vmov.32 d8[0], r0 ; NEON7-NEXT: ldr r0, [sp, #44] -; NEON7-NEXT: ldr r2, [sp, #40] ; NEON7-NEXT: mov r5, r1 +; NEON7-NEXT: mvn r2, #8 ; NEON7-NEXT: and r0, r0, #1 ; NEON7-NEXT: mvn r3, #0 ; NEON7-NEXT: rsb r1, r0, #0 +; NEON7-NEXT: ldr r0, [sp, #40] ; NEON7-NEXT: vmov.32 d9[0], r7 -; NEON7-NEXT: mov r0, r2 -; NEON7-NEXT: mvn r2, #8 ; NEON7-NEXT: bl __moddi3 ; NEON7-NEXT: vmov.32 d16[0], r0 ; NEON7-NEXT: adr r0, .LCPI3_0 @@ -610,14 +607,13 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; NEON8-NEXT: bl __moddi3 ; NEON8-NEXT: vmov.32 d8[0], r0 ; NEON8-NEXT: ldr r0, [sp, #44] -; NEON8-NEXT: ldr r2, [sp, #40] ; NEON8-NEXT: mov r5, r1 +; NEON8-NEXT: mvn r2, #8 ; NEON8-NEXT: and r0, r0, #1 ; NEON8-NEXT: mvn r3, #0 ; NEON8-NEXT: rsb r1, r0, #0 +; NEON8-NEXT: ldr r0, [sp, #40] ; NEON8-NEXT: vmov.32 d9[0], r7 -; NEON8-NEXT: mov r0, r2 -; NEON8-NEXT: mvn r2, #8 ; NEON8-NEXT: bl __moddi3 ; NEON8-NEXT: vmov.32 d16[0], r0 ; NEON8-NEXT: adr r0, .LCPI3_0 diff --git a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll index c40dd2e922963..355cff220de6f 100644 --- a/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll +++ b/llvm/test/CodeGen/ARM/vecreduce-fadd-legalization-strict.ll @@ -92,21 +92,21 @@ define double @test_v1f64_neutral(<1 x double> %a) nounwind { define fp128 @test_v1f128(<1 x fp128> %a, fp128 %s) nounwind { ; CHECK-LABEL: test_v1f128: ; CHECK: @ %bb.0: -; CHECK-NEXT: .save {r4, r5, r11, lr} -; CHECK-NEXT: push {r4, r5, r11, lr} +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} ; CHECK-NEXT: .pad #16 ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: str r0, [sp] ; CHECK-NEXT: str r1, [sp, #4] ; CHECK-NEXT: str r2, [sp, #8] ; 
CHECK-NEXT: str r3, [sp, #12]
-; CHECK-NEXT: ldr r0, [sp, #32]
-; CHECK-NEXT: ldr r1, [sp, #36]
-; CHECK-NEXT: ldr r2, [sp, #40]
-; CHECK-NEXT: ldr r3, [sp, #44]
+; CHECK-NEXT: ldr r0, [sp, #24]
+; CHECK-NEXT: ldr r1, [sp, #28]
+; CHECK-NEXT: ldr r2, [sp, #32]
+; CHECK-NEXT: ldr r3, [sp, #36]
 ; CHECK-NEXT: bl __addtf3
 ; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: pop {r4, r5, r11, lr}
+; CHECK-NEXT: pop {r11, lr}
 ; CHECK-NEXT: mov pc, lr
 %b = call fp128 @llvm.vector.reduce.fadd.f128.v1f128(fp128 %s, <1 x fp128> %a)
 ret fp128 %b
diff --git a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
index abb7fff831afe..69cb84ba97d27 100644
--- a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
@@ -204,8 +204,8 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
 define i4 @func3(i4 %x, i4 %y) nounwind {
 ; ARM-LABEL: func3:
 ; ARM: @ %bb.0:
-; ARM-NEXT: .save {r4, lr}
-; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
 ; ARM-NEXT: lsls r0, r0, #28
 ; ARM-NEXT: lsls r1, r1, #28
 ; ARM-NEXT: asrs r2, r1, #28
@@ -230,7 +230,7 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 ; ARM-NEXT: lsls r0, r2, #31
 ; ARM-NEXT: .LBB2_5:
 ; ARM-NEXT: asrs r0, r0, #28
-; ARM-NEXT: pop {r4, pc}
+; ARM-NEXT: pop {r7, pc}
 ; ARM-NEXT: .p2align 2
 ; ARM-NEXT: @ %bb.6:
 ; ARM-NEXT: .LCPI2_0:
@@ -383,8 +383,8 @@ define i64 @func5(i64 %x, i64 %y) {
 define i4 @func6(i4 %x, i4 %y) nounwind {
 ; ARM-LABEL: func6:
 ; ARM: @ %bb.0:
-; ARM-NEXT: .save {r4, lr}
-; ARM-NEXT: push {r4, lr}
+; ARM-NEXT: .save {r7, lr}
+; ARM-NEXT: push {r7, lr}
 ; ARM-NEXT: lsls r0, r0, #28
 ; ARM-NEXT: lsls r1, r1, #28
 ; ARM-NEXT: asrs r2, r1, #28
@@ -407,7 +407,7 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
 ; ARM-NEXT: mov r2, r0
 ; ARM-NEXT: .LBB5_5:
 ; ARM-NEXT: asrs r0, r2, #28
-; ARM-NEXT: pop {r4, pc}
+; ARM-NEXT: pop {r7, pc}
 ; ARM-NEXT: .p2align 2
 ; ARM-NEXT: @ %bb.6:
 ; ARM-NEXT: .LCPI5_0:
diff --git a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
index b0cc1c6886298..9b5fa1c2bc811 100644
--- a/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/Thumb/umulo-128-legalisation-lowering.ll
@@ -9,10 +9,10 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; THUMBV6-NEXT: .pad #60
 ; THUMBV6-NEXT: sub sp, #60
 ; THUMBV6-NEXT: mov r6, r3
-; THUMBV6-NEXT: mov r4, r0
-; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill
 ; THUMBV6-NEXT: mov r1, r2
 ; THUMBV6-NEXT: str r2, [sp, #52] @ 4-byte Spill
+; THUMBV6-NEXT: mov r4, r0
+; THUMBV6-NEXT: str r0, [sp, #40] @ 4-byte Spill
 ; THUMBV6-NEXT: ldr r2, [sp, #88]
 ; THUMBV6-NEXT: str r2, [sp, #48] @ 4-byte Spill
 ; THUMBV6-NEXT: movs r5, #0
diff --git a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
index de4b24da27a8d..370f706e420b2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fpclamptosat_vec.ll
@@ -622,23 +622,23 @@ define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16(<8 x half> %x) {
 ; CHECK-NEXT: vmovx.f16 s6, s0
 ; CHECK-NEXT: vcvt.s32.f16 s10, s0
 ; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vmovx.f16 s4, s1
-; CHECK-NEXT: vmovx.f16 s0, s2
 ; CHECK-NEXT: vcvt.s32.f16 s5, s3
+; CHECK-NEXT: vcvt.s32.f16 s12, s0
+; CHECK-NEXT: vmovx.f16 s0, s2
 ; CHECK-NEXT: vcvt.s32.f16 s7, s2
-; CHECK-NEXT: vcvt.s32.f16 s8, s1
-; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vcvt.s32.f16 s14, s0
 ; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: vmovx.f16 s4, s1
 ; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vcvt.s32.f16 s14, s0
-; CHECK-NEXT: vcvt.s32.f16 s12, s0
+; CHECK-NEXT: vcvt.s32.f16 s8, s1
 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvt.s32.f16 s4, s4
 ; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vcvt.s32.f16 s4, s4
 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vcvt.s32.f16 s6, s6
 ; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: vmov r2, s10
 ; CHECK-NEXT: mov r0, sp
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
@@ -675,14 +675,13 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: vmov r0, r1, d9
 ; CHECK-NEXT: bl __fixdfti
-; CHECK-NEXT: vmov r12, lr, d8
 ; CHECK-NEXT: subs.w r4, r0, #-1
 ; CHECK-NEXT: mvn r9, #-2147483648
 ; CHECK-NEXT: sbcs.w r4, r1, r9
-; CHECK-NEXT: sbcs r4, r2, #0
 ; CHECK-NEXT: mov.w r7, #-1
-; CHECK-NEXT: sbcs r4, r3, #0
+; CHECK-NEXT: sbcs r4, r2, #0
 ; CHECK-NEXT: mov.w r10, #-2147483648
+; CHECK-NEXT: sbcs r4, r3, #0
 ; CHECK-NEXT: cset r4, lt
 ; CHECK-NEXT: cmp r4, #0
 ; CHECK-NEXT: csel r3, r3, r4, ne
@@ -696,8 +695,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT: cset r5, lt
 ; CHECK-NEXT: cmp r5, #0
 ; CHECK-NEXT: csel r8, r1, r10, ne
-; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: mov r1, lr
+; CHECK-NEXT: vmov r0, r1, d8
 ; CHECK-NEXT: bl __fixdfti
 ; CHECK-NEXT: subs.w r6, r0, #-1
 ; CHECK-NEXT: sbcs.w r6, r1, r9
@@ -737,23 +735,20 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-LABEL: utest_f64i64:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .save {r5, r6, r7, lr}
+; CHECK-NEXT: push {r5, r6, r7, lr}
 ; CHECK-NEXT: .vsave {d8, d9}
 ; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: vmov r0, r1, d9
 ; CHECK-NEXT: bl __fixunsdfti
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: vmov r4, r1, d8
 ; CHECK-NEXT: subs r2, #1
+; CHECK-NEXT: mov r5, r1
 ; CHECK-NEXT: sbcs r2, r3, #0
 ; CHECK-NEXT: cset r6, lo
 ; CHECK-NEXT: cmp r6, #0
 ; CHECK-NEXT: csel r7, r0, r6, ne
-; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: vmov r0, r1, d8
 ; CHECK-NEXT: bl __fixunsdfti
 ; CHECK-NEXT: subs r2, #1
 ; CHECK-NEXT: sbcs r2, r3, #0
@@ -767,8 +762,7 @@ define arm_aapcs_vfpcc <2 x i64> @utest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r7
 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
 ; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop {r5, r6, r7, pc}
 entry:
 %conv = fptoui <2 x double> %x to <2 x i128>
 %0 = icmp ult <2 x i128> %conv,
@@ -789,12 +783,11 @@ define arm_aapcs_vfpcc <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: vmov r0, r1, d9
 ; CHECK-NEXT: bl __fixdfti
-; CHECK-NEXT: vmov r12, lr, d8
 ; CHECK-NEXT: subs r4, r2, #1
-; CHECK-NEXT: sbcs r4, r3, #0
 ; CHECK-NEXT: mov.w r8, #1
+; CHECK-NEXT: sbcs r4, r3, #0
+; CHECK-NEXT: mov.w r7, #0
 ; CHECK-NEXT: cset r5, lt
-; CHECK-NEXT: movs r7, #0
 ; CHECK-NEXT: cmp r5, #0
 ; CHECK-NEXT: csel r0, r0, r5, ne
 ; CHECK-NEXT: csel r3, r3, r5, ne
@@ -807,8 +800,7 @@ define arm_aapcs_vfpcc <2 x i64> @ustest_f64i64(<2 x double> %x) {
 ; CHECK-NEXT: cset r6, lt
 ; CHECK-NEXT: cmp r6, #0
 ; CHECK-NEXT: csel r9, r0, r6, ne
-; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: mov r1, lr
+; CHECK-NEXT: vmov r0, r1, d8
 ; CHECK-NEXT: bl __fixdfti
 ; CHECK-NEXT: subs r5, r2, #1
 ; CHECK-NEXT: sbcs r5, r3, #0
@@ -1704,23 +1696,23 @@ define arm_aapcs_vfpcc <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
 ; CHECK-NEXT: vmovx.f16 s6, s0
 ; CHECK-NEXT: vcvt.s32.f16 s10, s0
 ; CHECK-NEXT: vmovx.f16 s0, s3
-; CHECK-NEXT: vmovx.f16 s4, s1
-; CHECK-NEXT: vmovx.f16 s0, s2
 ; CHECK-NEXT: vcvt.s32.f16 s5, s3
+; CHECK-NEXT: vcvt.s32.f16 s12, s0
+; CHECK-NEXT: vmovx.f16 s0, s2
 ; CHECK-NEXT: vcvt.s32.f16 s7, s2
-; CHECK-NEXT: vcvt.s32.f16 s8, s1
-; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vcvt.s32.f16 s14, s0
 ; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: vmovx.f16 s4, s1
 ; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vcvt.s32.f16 s14, s0
-; CHECK-NEXT: vcvt.s32.f16 s12, s0
+; CHECK-NEXT: vcvt.s32.f16 s8, s1
 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r1
 ; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vcvt.s32.f16 s4, s4
 ; CHECK-NEXT: vmov r2, s14
-; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vcvt.s32.f16 s4, s4
 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r1
+; CHECK-NEXT: vcvt.s32.f16 s6, s6
 ; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vmov.i32 q0, #0x0
 ; CHECK-NEXT: vmov r2, s10
 ; CHECK-NEXT: mov r0, sp
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
@@ -1755,14 +1747,13 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: vmov r0, r1, d9
 ; CHECK-NEXT: bl __fixdfti
-; CHECK-NEXT: vmov r12, lr, d8
 ; CHECK-NEXT: subs.w r4, r0, #-1
 ; CHECK-NEXT: mvn r9, #-2147483648
 ; CHECK-NEXT: sbcs.w r4, r1, r9
-; CHECK-NEXT: sbcs r4, r2, #0
 ; CHECK-NEXT: mov.w r7, #-1
-; CHECK-NEXT: sbcs r4, r3, #0
+; CHECK-NEXT: sbcs r4, r2, #0
 ; CHECK-NEXT: mov.w r10, #-2147483648
+; CHECK-NEXT: sbcs r4, r3, #0
 ; CHECK-NEXT: cset r4, lt
 ; CHECK-NEXT: cmp r4, #0
 ; CHECK-NEXT: csel r3, r3, r4, ne
@@ -1776,8 +1767,7 @@ define arm_aapcs_vfpcc <2 x i64> @stest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT: cset r5, lt
 ; CHECK-NEXT: cmp r5, #0
 ; CHECK-NEXT: csel r8, r1, r10, ne
-; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: mov r1, lr
+; CHECK-NEXT: vmov r0, r1, d8
 ; CHECK-NEXT: bl __fixdfti
 ; CHECK-NEXT: subs.w r6, r0, #-1
 ; CHECK-NEXT: sbcs.w r6, r1, r9
@@ -1815,23 +1805,20 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-LABEL: utest_f64i64_mm:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .save {r5, r6, r7, lr}
+; CHECK-NEXT: push {r5, r6, r7, lr}
 ; CHECK-NEXT: .vsave {d8, d9}
 ; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: vmov r0, r1, d9
 ; CHECK-NEXT: bl __fixunsdfti
-; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: vmov r4, r1, d8
 ; CHECK-NEXT: subs r2, #1
+; CHECK-NEXT: mov r5, r1
 ; CHECK-NEXT: sbcs r2, r3, #0
 ; CHECK-NEXT: cset r6, lo
 ; CHECK-NEXT: cmp r6, #0
 ; CHECK-NEXT: csel r7, r0, r6, ne
-; CHECK-NEXT: mov r0, r4
+; CHECK-NEXT: vmov r0, r1, d8
 ; CHECK-NEXT: bl __fixunsdfti
 ; CHECK-NEXT: subs r2, #1
 ; CHECK-NEXT: sbcs r2, r3, #0
@@ -1845,8 +1832,7 @@ define arm_aapcs_vfpcc <2 x i64> @utest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r7
 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
 ; CHECK-NEXT: vpop {d8, d9}
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop {r5, r6, r7, pc}
 entry:
 %conv = fptoui <2 x double> %x to <2 x i128>
 %spec.store.select = call <2 x i128> @llvm.umin.v2i128(<2 x i128> %conv, <2 x i128> )
@@ -1864,18 +1850,17 @@ define arm_aapcs_vfpcc <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: vmov r0, r1, d9
 ; CHECK-NEXT: bl __fixdfti
-; CHECK-NEXT: mov r8, r1
-; CHECK-NEXT: vmov r4, r1, d8
 ; CHECK-NEXT: subs r2, #1
+; CHECK-NEXT: mov r8, r1
 ; CHECK-NEXT: sbcs r2, r3, #0
 ; CHECK-NEXT: cset r7, lt
 ; CHECK-NEXT: cmp r7, #0
 ; CHECK-NEXT: csel r6, r0, r7, ne
+; CHECK-NEXT: vmov r0, r1, d8
 ; CHECK-NEXT: csel r5, r3, r7, ne
 ; CHECK-NEXT: cmp r5, #0
 ; CHECK-NEXT: it mi
 ; CHECK-NEXT: movmi r6, #0
-; CHECK-NEXT: mov r0, r4
 ; CHECK-NEXT: bl __fixdfti
 ; CHECK-NEXT: subs r2, #1
 ; CHECK-NEXT: sbcs r2, r3, #0
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index aa8c618b41274..0f7f31077414a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -1379,12 +1379,12 @@ define arm_aapcs_vfpcc <5 x i32> @test_signed_v5f16_v5i32(<5 x half> %f) {
 ; CHECK-NEXT: vmovx.f16 s4, s1
 ; CHECK-NEXT: vcvt.s32.f16 s8, s1
 ; CHECK-NEXT: vcvt.s32.f16 s0, s0
-; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vcvt.s32.f16 s4, s4
 ; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vcvt.s32.f16 s2, s2
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vcvt.s32.f16 s4, s4
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
-; CHECK-NEXT: vcvt.s32.f16 s2, s2
 ; CHECK-NEXT: vmov r1, s4
 ; CHECK-NEXT: vmov r2, s6
 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
@@ -1404,11 +1404,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-NEXT: vcvt.s32.f16 s10, s1
 ; CHECK-NEXT: vcvt.s32.f16 s0, s0
 ; CHECK-NEXT: vcvt.s32.f16 s4, s2
-; CHECK-NEXT: vcvt.s32.f16 s2, s2
-; CHECK-NEXT: vcvt.s32.f16 s8, s8
+; CHECK-NEXT: vmovx.f16 s2, s2
 ; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vcvt.s32.f16 s8, s8
 ; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: vmovx.f16 s2, s2
+; CHECK-NEXT: vcvt.s32.f16 s2, s2
 ; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
 ; CHECK-NEXT: vmov r1, s6
@@ -1431,11 +1431,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_signed_v7f16_v7i32(<7 x half> %f) {
 ; CHECK-NEXT: vcvt.s32.f16 s12, s1
 ; CHECK-NEXT: vcvt.s32.f16 s0, s0
 ; CHECK-NEXT: vcvt.s32.f16 s4, s2
-; CHECK-NEXT: vcvt.s32.f16 s2, s2
-; CHECK-NEXT: vcvt.s32.f16 s10, s10
+; CHECK-NEXT: vmovx.f16 s2, s2
 ; CHECK-NEXT: vcvt.s32.f16 s8, s8
+; CHECK-NEXT: vcvt.s32.f16 s10, s10
 ; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vmovx.f16 s2, s2
+; CHECK-NEXT: vcvt.s32.f16 s2, s2
 ; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: vcvt.s32.f16 s6, s3
 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
@@ -1465,9 +1465,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32(<8 x half> %f) {
 ; CHECK-NEXT: vcvt.s32.f16 s14, s2
 ; CHECK-NEXT: vcvt.s32.f16 s2, s1
 ; CHECK-NEXT: vcvt.s32.f16 s0, s0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vcvt.s32.f16 s6, s6
 ; CHECK-NEXT: vcvt.s32.f16 s4, s4
+; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vmov r0, s2
 ; CHECK-NEXT: vmov r1, s0
 ; CHECK-NEXT: vcvt.s32.f16 s12, s3
 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
@@ -2122,12 +2122,10 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT: mov r9, r0
 ; CHECK-NEXT: vmov r0, s18
 ; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: mov r10, r3
-; CHECK-NEXT: vmov r3, s16
 ; CHECK-NEXT: vldr s22, .LCPI30_0
-; CHECK-NEXT: vmov r7, s17
+; CHECK-NEXT: mov r10, r3
 ; CHECK-NEXT: vldr s20, .LCPI30_1
-; CHECK-NEXT: vmov r4, s19
+; CHECK-NEXT: vmov r7, s17
 ; CHECK-NEXT: vcmp.f32 s18, s22
 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT: vcmp.f32 s18, s20
@@ -2167,7 +2165,8 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT: it vs
 ; CHECK-NEXT: movvs r0, #0
 ; CHECK-NEXT: str.w r0, [r9, #25]
-; CHECK-NEXT: mov r0, r3
+; CHECK-NEXT: vmov r0, s16
+; CHECK-NEXT: vmov r4, s19
 ; CHECK-NEXT: bl __fixsfti
 ; CHECK-NEXT: vcmp.f32 s16, s22
 ; CHECK-NEXT: mov r11, r3
@@ -2372,20 +2371,18 @@ define arm_aapcs_vfpcc <4 x i100> @test_signed_v4f32_v4i100(<4 x float> %f) {
 define arm_aapcs_vfpcc <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-LABEL: test_signed_v4f32_v4i128:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .save {r4, r6, r7, lr}
+; CHECK-NEXT: push {r4, r6, r7, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10, d11}
 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: mov r4, r0
 ; CHECK-NEXT: vmov r0, s19
 ; CHECK-NEXT: bl __fixsfti
-; CHECK-NEXT: vmov r5, s18
 ; CHECK-NEXT: vldr s22, .LCPI31_0
-; CHECK-NEXT: vldr s20, .LCPI31_1
 ; CHECK-NEXT: vmov r7, s16
+; CHECK-NEXT: vldr s20, .LCPI31_1
+; CHECK-NEXT: vmov r6, s17
 ; CHECK-NEXT: vcmp.f32 s19, s22
 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT: vcmp.f32 s19, s20
@@ -2438,8 +2435,7 @@ define arm_aapcs_vfpcc <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT: it vs
 ; CHECK-NEXT: movvs r0, #0
 ; CHECK-NEXT: str r0, [r4, #48]
-; CHECK-NEXT: mov r0, r5
-; CHECK-NEXT: vmov r6, s17
+; CHECK-NEXT: vmov r0, s18
 ; CHECK-NEXT: bl __fixsfti
 ; CHECK-NEXT: vcmp.f32 s18, s22
 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -2602,8 +2598,7 @@ define arm_aapcs_vfpcc <4 x i128> @test_signed_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT: movvs r0, #0
 ; CHECK-NEXT: str r0, [r4]
 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop {r4, r6, r7, pc}
 ; CHECK-NEXT: .p2align 2
 ; CHECK-NEXT: @ %bb.1:
 ; CHECK-NEXT: .LCPI31_0:
@@ -4823,9 +4818,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_signed_v8f16_v8i32_duplicate(<8 x half> %
 ; CHECK-NEXT: vcvt.s32.f16 s14, s2
 ; CHECK-NEXT: vcvt.s32.f16 s2, s1
 ; CHECK-NEXT: vcvt.s32.f16 s0, s0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vcvt.s32.f16 s6, s6
 ; CHECK-NEXT: vcvt.s32.f16 s4, s4
+; CHECK-NEXT: vcvt.s32.f16 s6, s6
+; CHECK-NEXT: vmov r0, s2
 ; CHECK-NEXT: vmov r1, s0
 ; CHECK-NEXT: vcvt.s32.f16 s12, s3
 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 1849341ce72b7..f3386f27bb5e7 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -1145,12 +1145,12 @@ define arm_aapcs_vfpcc <5 x i32> @test_unsigned_v5f16_v5i32(<5 x half> %f) {
 ; CHECK-NEXT: vmovx.f16 s4, s1
 ; CHECK-NEXT: vcvt.u32.f16 s8, s1
 ; CHECK-NEXT: vcvt.u32.f16 s0, s0
-; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vcvt.u32.f16 s4, s4
 ; CHECK-NEXT: vcvt.u32.f16 s6, s6
+; CHECK-NEXT: vmov r1, s8
+; CHECK-NEXT: vcvt.u32.f16 s2, s2
 ; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vcvt.u32.f16 s4, s4
 ; CHECK-NEXT: vmov q2[2], q2[0], r2, r1
-; CHECK-NEXT: vcvt.u32.f16 s2, s2
 ; CHECK-NEXT: vmov r1, s4
 ; CHECK-NEXT: vmov r2, s6
 ; CHECK-NEXT: vmov q2[3], q2[1], r2, r1
@@ -1170,11 +1170,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f16_v6i32(<6 x half> %f) {
 ; CHECK-NEXT: vcvt.u32.f16 s10, s1
 ; CHECK-NEXT: vcvt.u32.f16 s0, s0
 ; CHECK-NEXT: vcvt.u32.f16 s4, s2
-; CHECK-NEXT: vcvt.u32.f16 s2, s2
-; CHECK-NEXT: vcvt.u32.f16 s8, s8
+; CHECK-NEXT: vmovx.f16 s2, s2
 ; CHECK-NEXT: vcvt.u32.f16 s6, s6
+; CHECK-NEXT: vcvt.u32.f16 s8, s8
 ; CHECK-NEXT: vmov r1, s10
-; CHECK-NEXT: vmovx.f16 s2, s2
+; CHECK-NEXT: vcvt.u32.f16 s2, s2
 ; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
 ; CHECK-NEXT: vmov r1, s6
@@ -1197,11 +1197,11 @@ define arm_aapcs_vfpcc <7 x i32> @test_unsigned_v7f16_v7i32(<7 x half> %f) {
 ; CHECK-NEXT: vcvt.u32.f16 s12, s1
 ; CHECK-NEXT: vcvt.u32.f16 s0, s0
 ; CHECK-NEXT: vcvt.u32.f16 s4, s2
-; CHECK-NEXT: vcvt.u32.f16 s2, s2
-; CHECK-NEXT: vcvt.u32.f16 s10, s10
+; CHECK-NEXT: vmovx.f16 s2, s2
 ; CHECK-NEXT: vcvt.u32.f16 s8, s8
+; CHECK-NEXT: vcvt.u32.f16 s10, s10
 ; CHECK-NEXT: vmov r1, s12
-; CHECK-NEXT: vmovx.f16 s2, s2
+; CHECK-NEXT: vcvt.u32.f16 s2, s2
 ; CHECK-NEXT: vmov r2, s0
 ; CHECK-NEXT: vcvt.u32.f16 s6, s3
 ; CHECK-NEXT: vmov q3[2], q3[0], r2, r1
@@ -1231,9 +1231,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_unsigned_v8f16_v8i32(<8 x half> %f) {
 ; CHECK-NEXT: vcvt.u32.f16 s14, s2
 ; CHECK-NEXT: vcvt.u32.f16 s2, s1
 ; CHECK-NEXT: vcvt.u32.f16 s0, s0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vcvt.u32.f16 s6, s6
 ; CHECK-NEXT: vcvt.u32.f16 s4, s4
+; CHECK-NEXT: vcvt.u32.f16 s6, s6
+; CHECK-NEXT: vmov r0, s2
 ; CHECK-NEXT: vmov r1, s0
 ; CHECK-NEXT: vcvt.u32.f16 s12, s3
 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
@@ -1739,14 +1739,13 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT: mov r8, r0
 ; CHECK-NEXT: vmov r0, s18
 ; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: mov r9, r3
-; CHECK-NEXT: vmov r3, s16
 ; CHECK-NEXT: vldr s20, .LCPI30_0
 ; CHECK-NEXT: vcmp.f32 s18, #0
 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: movlt r2, #0
 ; CHECK-NEXT: vcmp.f32 s18, s20
+; CHECK-NEXT: mov r9, r3
 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT: vcmp.f32 s18, #0
 ; CHECK-NEXT: it gt
@@ -1769,9 +1768,9 @@ define arm_aapcs_vfpcc <4 x i100> @test_unsigned_v4f32_v4i100(<4 x float> %f) {
 ; CHECK-NEXT: it gt
 ; CHECK-NEXT: movgt.w r0, #-1
 ; CHECK-NEXT: str.w r0, [r8, #25]
-; CHECK-NEXT: vmov r4, s19
+; CHECK-NEXT: vmov r0, s16
 ; CHECK-NEXT: vmov r7, s17
-; CHECK-NEXT: mov r0, r3
+; CHECK-NEXT: vmov r4, s19
 ; CHECK-NEXT: bl __fixunssfti
 ; CHECK-NEXT: vcmp.f32 s16, #0
 ; CHECK-NEXT: mov r10, r3
@@ -1922,23 +1921,20 @@ define arm_aapcs_vfpcc <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 define arm_aapcs_vfpcc <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-LABEL: test_unsigned_v4f32_v4i128:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT: push {r4, r5, r6, r7, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .save {r4, r6, r7, lr}
+; CHECK-NEXT: push {r4, r6, r7, lr}
 ; CHECK-NEXT: .vsave {d8, d9, d10}
 ; CHECK-NEXT: vpush {d8, d9, d10}
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: mov r4, r0
 ; CHECK-NEXT: vmov r0, s19
 ; CHECK-NEXT: bl __fixunssfti
-; CHECK-NEXT: vmov r5, s18
 ; CHECK-NEXT: vldr s20, .LCPI31_0
 ; CHECK-NEXT: vcmp.f32 s19, #0
 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
-; CHECK-NEXT: vcmp.f32 s19, s20
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: movlt r3, #0
+; CHECK-NEXT: vcmp.f32 s19, s20
 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
 ; CHECK-NEXT: vcmp.f32 s19, #0
 ; CHECK-NEXT: it gt
@@ -1970,9 +1966,9 @@ define arm_aapcs_vfpcc <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT: it gt
 ; CHECK-NEXT: movgt.w r0, #-1
 ; CHECK-NEXT: str r0, [r4, #48]
+; CHECK-NEXT: vmov r0, s18
 ; CHECK-NEXT: vmov r7, s16
 ; CHECK-NEXT: vmov r6, s17
-; CHECK-NEXT: mov r0, r5
 ; CHECK-NEXT: bl __fixunssfti
 ; CHECK-NEXT: vcmp.f32 s18, #0
 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
@@ -2087,8 +2083,7 @@ define arm_aapcs_vfpcc <4 x i128> @test_unsigned_v4f32_v4i128(<4 x float> %f) {
 ; CHECK-NEXT: movgt.w r0, #-1
 ; CHECK-NEXT: str r0, [r4]
 ; CHECK-NEXT: vpop {d8, d9, d10}
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT: pop {r4, r6, r7, pc}
 ; CHECK-NEXT: .p2align 2
 ; CHECK-NEXT: @ %bb.1:
 ; CHECK-NEXT: .LCPI31_0:
@@ -3735,9 +3730,9 @@ define arm_aapcs_vfpcc <8 x i32> @test_unsigned_v8f16_v8i32_duplicate(<8 x half>
 ; CHECK-NEXT: vcvt.u32.f16 s14, s2
 ; CHECK-NEXT: vcvt.u32.f16 s2, s1
 ; CHECK-NEXT: vcvt.u32.f16 s0, s0
-; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vcvt.u32.f16 s6, s6
 ; CHECK-NEXT: vcvt.u32.f16 s4, s4
+; CHECK-NEXT: vcvt.u32.f16 s6, s6
+; CHECK-NEXT: vmov r0, s2
 ; CHECK-NEXT: vmov r1, s0
 ; CHECK-NEXT: vcvt.u32.f16 s12, s3
 ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
index e67b2fe32b7e2..fe28f785623ed 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll
@@ -65,13 +65,13 @@ define arm_aapcs_vfpcc <4 x i32> @ext_add_trunc_i32(<4 x i32> %a, <4 x i32> %b)
 ; CHECK-LABEL: ext_add_trunc_i32:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.f32 s8, s6
-; CHECK-NEXT: vmov.f32 s2, s3
 ; CHECK-NEXT: vmov.f32 s6, s7
 ; CHECK-NEXT: vmov r0, s8
-; CHECK-NEXT: vmov.f32 s8, s3
+; CHECK-NEXT: vmov.f32 s8, s2
+; CHECK-NEXT: vmov.f32 s2, s3
+; CHECK-NEXT: vmov r1, s8
 ; CHECK-NEXT: vmov r2, s2
 ; CHECK-NEXT: vmov.f32 s2, s5
-; CHECK-NEXT: vmov r1, s8
 ; CHECK-NEXT: add.w r12, r1, r0
 ; CHECK-NEXT: vmov r1, s6
 ; CHECK-NEXT: vmov r0, s0
diff --git a/llvm/test/CodeGen/Thumb2/mve-minmax.ll b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
index d536e6b72ac9c..dc4b8c94a8c67 100644
--- a/llvm/test/CodeGen/Thumb2/mve-minmax.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-minmax.ll
@@ -310,13 +310,12 @@ define arm_aapcs_vfpcc <2 x double> @maxnm_float64_t(<2 x double> %src1, <2 x do
 ; CHECK-NEXT: vmov r0, r1, d8
 ; CHECK-NEXT: vmov r2, r3, d10
 ; CHECK-NEXT: bl __aeabi_dcmpgt
-; CHECK-NEXT: vmov r12, r1, d9
 ; CHECK-NEXT: cmp r0, #0
-; CHECK-NEXT: vmov r2, r3, d11
+; CHECK-NEXT: mov.w r4, #0
 ; CHECK-NEXT: csetm r0, ne
-; CHECK-NEXT: movs r4, #0
+; CHECK-NEXT: vmov r2, r3, d11
 ; CHECK-NEXT: bfi r4, r0, #0, #8
-; CHECK-NEXT: mov r0, r12
+; CHECK-NEXT: vmov r0, r1, d9
 ; CHECK-NEXT: bl __aeabi_dcmpgt
 ; CHECK-NEXT: cmp r0, #0
 ; CHECK-NEXT: csetm r0, ne
diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
index 117469f3bd788..9bac575b571a2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-pred-ext.ll
@@ -171,12 +171,11 @@ define arm_aapcs_vfpcc <2 x i64> @sext_v2i1_v2f64(<2 x double> %src) {
 ; CHECK-MVE-NEXT: mov r2, r4
 ; CHECK-MVE-NEXT: mov r3, r5
 ; CHECK-MVE-NEXT: bl __aeabi_dcmpeq
-; CHECK-MVE-NEXT: vmov r2, r1, d8
 ; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vmov r0, r1, d8
+; CHECK-MVE-NEXT: mov r2, r4
 ; CHECK-MVE-NEXT: mov r3, r5
 ; CHECK-MVE-NEXT: csetm r6, eq
-; CHECK-MVE-NEXT: mov r0, r2
-; CHECK-MVE-NEXT: mov r2, r4
 ; CHECK-MVE-NEXT: bl __aeabi_dcmpeq
 ; CHECK-MVE-NEXT: cmp r0, #0
 ; CHECK-MVE-NEXT: csetm r0, eq
@@ -407,14 +406,13 @@ define arm_aapcs_vfpcc <2 x i64> @zext_v2i1_v2f64(<2 x double> %src) {
 ; CHECK-MVE-NEXT: mov r2, r4
 ; CHECK-MVE-NEXT: mov r3, r5
 ; CHECK-MVE-NEXT: bl __aeabi_dcmpeq
-; CHECK-MVE-NEXT: vmov r2, r1, d8
-; CHECK-MVE-NEXT: adr r3, .LCPI13_1
 ; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vmov r0, r1, d8
+; CHECK-MVE-NEXT: adr r3, .LCPI13_1
+; CHECK-MVE-NEXT: mov r2, r4
 ; CHECK-MVE-NEXT: vldrw.u32 q4, [r3]
 ; CHECK-MVE-NEXT: mov r3, r5
 ; CHECK-MVE-NEXT: csetm r6, eq
-; CHECK-MVE-NEXT: mov r0, r2
-; CHECK-MVE-NEXT: mov r2, r4
 ; CHECK-MVE-NEXT: bl __aeabi_dcmpeq
 ; CHECK-MVE-NEXT: cmp r0, #0
 ; CHECK-MVE-NEXT: csetm r0, eq
@@ -942,8 +940,7 @@ define arm_aapcs_vfpcc <2 x double> @uitofp_v2i1_v2f64(<2 x i64> %src) {
 ; CHECK-NEXT: vmov d9, r0, r1
 ; CHECK-NEXT: rsbs r2, r2, #0
 ; CHECK-NEXT: sbcs.w r2, r4, r3
-; CHECK-NEXT: cset r2, lt
-; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: cset r0, lt
 ; CHECK-NEXT: bl __aeabi_ui2d
 ; CHECK-NEXT: vmov d8, r0, r1
 ; CHECK-NEXT: vmov q0, q4
@@ -973,8 +970,7 @@ define arm_aapcs_vfpcc <2 x double> @sitofp_v2i1_v2f64(<2 x i64> %src) {
 ; CHECK-NEXT: vmov d9, r0, r1
 ; CHECK-NEXT: rsbs r2, r2, #0
 ; CHECK-NEXT: sbcs.w r2, r4, r3
-; CHECK-NEXT: csetm r2, lt
-; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: csetm r0, lt
 ; CHECK-NEXT: bl __aeabi_i2d
 ; CHECK-NEXT: vmov d8, r0, r1
 ; CHECK-NEXT: vmov q0, q4
@@ -996,14 +992,13 @@ define arm_aapcs_vfpcc <2 x double> @fptoui_v2i1_v2f64(<2 x double> %src) {
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: vmov r0, r1, d8
 ; CHECK-NEXT: bl __aeabi_d2iz
-; CHECK-NEXT: vmov r2, r1, d9
 ; CHECK-NEXT: movs r4, #0
 ; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: adr r3, .LCPI28_0
 ; CHECK-NEXT: bfi r4, r0, #0, #8
+; CHECK-NEXT: vmov r0, r1, d9
+; CHECK-NEXT: adr r3, .LCPI28_0
 ; CHECK-NEXT: vmov.i32 q4, #0x0
 ; CHECK-NEXT: vldrw.u32 q5, [r3]
-; CHECK-NEXT: mov r0, r2
 ; CHECK-NEXT: bl __aeabi_d2iz
 ; CHECK-NEXT: rsbs r0, r0, #0
 ; CHECK-NEXT: bfi r4, r0, #8, #8
@@ -1034,13 +1029,12 @@ define arm_aapcs_vfpcc <2 x double> @fptosi_v2i1_v2f64(<2 x double> %src) {
 ; CHECK-NEXT: vmov q4, q0
 ; CHECK-NEXT: vmov r0, r1, d8
 ; CHECK-NEXT: bl __aeabi_d2iz
-; CHECK-NEXT: vmov r2, r1, d9
 ; CHECK-NEXT: movs r4, #0
 ; CHECK-NEXT: adr r3, .LCPI29_0
 ; CHECK-NEXT: bfi r4, r0, #0, #8
+; CHECK-NEXT: vmov r0, r1, d9
 ; CHECK-NEXT: vmov.i32 q4, #0x0
 ; CHECK-NEXT: vldrw.u32 q5, [r3]
-; CHECK-NEXT: mov r0, r2
 ; CHECK-NEXT: bl __aeabi_d2iz
 ; CHECK-NEXT: bfi r4, r0, #8, #8
 ; CHECK-NEXT: vmsr p0, r4
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index aaee97318ecd0..f4643f8c6c4a1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -7,10 +7,10 @@
 define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) {
 ; CHECK-LABEL: shuffle1_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -30,10 +30,10 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @shuffle3_i32(<4 x i32> %src) {
 ; CHECK-LABEL: shuffle3_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s1
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s1
 ; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -923,10 +923,10 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @shuffle2_i64(<2 x i64> %src) {
 ; CHECK-LABEL: shuffle2_i64:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s6, s0
 ; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f32 s6, s0
 ; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.f32 s7, s1
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -948,10 +948,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @shuffle1_f32(<4 x float> %src) {
 ; CHECK-LABEL: shuffle1_f32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -971,10 +971,10 @@ entry:
 define arm_aapcs_vfpcc <4 x float> @shuffle3_f32(<4 x float> %src) {
 ; CHECK-LABEL: shuffle3_f32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s1
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s1
 ; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -1383,10 +1383,10 @@ entry:
 define arm_aapcs_vfpcc <2 x double> @shuffle2_f64(<2 x double> %src) {
 ; CHECK-LABEL: shuffle2_f64:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s6, s0
 ; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f32 s6, s0
 ; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.f32 s7, s1
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
index de9328a3c2423..6ce7550014296 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shufflemov.ll
@@ -7,10 +7,10 @@
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_45670123(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_45670123:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s6, s0
 ; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f32 s6, s0
 ; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.f32 s7, s1
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -21,10 +21,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_67452301(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_67452301:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -69,10 +69,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_u7u5u3u1(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_u7u5u3u1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -83,10 +83,10 @@ entry:
 define arm_aapcs_vfpcc <8 x i16> @shuffle_i16_6u4u2u0u(<8 x i16> %s1, <8 x i16> %s2) {
 ; CHECK-LABEL: shuffle_i16_6u4u2u0u:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -120,10 +120,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45670123(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cdef89ab45670123:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -213,10 +213,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdeu89ub4u67u123(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cdeu89ub4u67u123:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -227,10 +227,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cduu8uubuu67u12u(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cduu8uubuu67u12u:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -241,10 +241,10 @@ entry:
 define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cuuuuuubuu6uuu2u(<16 x i8> %s1, <16 x i8> %s2) {
 ; CHECK-LABEL: shuffle_i8_cuuuuuubuu6uuu2u:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -261,9 +261,9 @@ define arm_aapcs_vfpcc <16 x i8> @shuffle_i8_cdef89ab45u700123(<16 x i8> %s1, <1
 ; CHECK-NEXT: vmov.8 q1[9], r0
 ; CHECK-NEXT: vmov.u8 r0, q0[0]
 ; CHECK-NEXT: vmov.8 q1[11], r0
-; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov.f32 s4, s3
 ; CHECK-NEXT: vmov.f32 s5, s2
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -278,10 +278,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_45670123(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_45670123:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s6, s0
 ; CHECK-NEXT: vmov.f32 s4, s2
-; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f32 s6, s0
 ; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.f32 s7, s1
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -292,10 +292,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_67452301(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_67452301:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -340,10 +340,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_u7u5u3u1(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_u7u5u3u1:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
@@ -354,10 +354,10 @@ entry:
 define arm_aapcs_vfpcc <8 x half> @shuffle_f16_6u4u2u0u(<8 x half> %s1, <8 x half> %s2) {
 ; CHECK-LABEL: shuffle_f16_6u4u2u0u:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s0
-; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s4, s3
+; CHECK-NEXT: vmov.f32 s5, s2
 ; CHECK-NEXT: vmov.f32 s6, s1
+; CHECK-NEXT: vmov.f32 s7, s0
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
index 04dcc77e9f937..1279714b5a78c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabdus.ll
@@ -374,17 +374,17 @@ define void @vabd_loop_s32(ptr nocapture readonly %x, ptr nocapture readonly %y,
 ; CHECK-NEXT: .LBB17_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
-; CHECK-NEXT: vmov.f32 s12, s10
 ; CHECK-NEXT: vmov.f32 s8, s6
 ; CHECK-NEXT: vmov r7, s4
 ; CHECK-NEXT: vmov.f32 s6, s7
-; CHECK-NEXT: vmov r4, s12
 ; CHECK-NEXT: vmov r3, s8
 ; CHECK-NEXT: vldrw.u32 q2, [r1], #16
+; CHECK-NEXT: vmov.f32 s12, s10
 ; CHECK-NEXT: vmov.f32 s10, s5
 ; CHECK-NEXT: vmov.f32 s14, s11
-; CHECK-NEXT: subs.w r8, r3, r4
+; CHECK-NEXT: vmov r4, s12
 ; CHECK-NEXT: asr.w r12, r3, #31
+; CHECK-NEXT: subs.w r8, r3, r4
 ; CHECK-NEXT: sbc.w r12, r12, r4, asr #31
 ; CHECK-NEXT: vmov r4, s10
 ; CHECK-NEXT: vmov.f32 s10, s9
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
index 82db1a95037a9..a5725a2a30048 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt16.ll
@@ -18,10 +18,10 @@ entry:
 define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
 ; CHECK-LABEL: fpext_8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vcvtt.f32.f16 s9, s0
+; CHECK-NEXT: vcvtt.f32.f16 s11, s1
 ; CHECK-NEXT: vcvtb.f32.f16 s10, s1
+; CHECK-NEXT: vcvtt.f32.f16 s9, s0
 ; CHECK-NEXT: vcvtb.f32.f16 s8, s0
-; CHECK-NEXT: vcvtt.f32.f16 s11, s1
 ; CHECK-NEXT: vcvtt.f32.f16 s7, s3
 ; CHECK-NEXT: vcvtb.f32.f16 s6, s3
 ; CHECK-NEXT: vcvtt.f32.f16 s5, s2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vld4.ll b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
index 271beac139288..b49f19e55c895 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vld4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vld4.ll
@@ -10,15 +10,17 @@ define void @vld4_v2i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT: vldrw.u32 q0, [r0]
 ; CHECK-NEXT: vmov.f32 s10, s7
 ; CHECK-NEXT: vmov r2, s6
+; CHECK-NEXT: vmov.f32 s6, s5
 ; CHECK-NEXT: vmov r3, s4
 ; CHECK-NEXT: vmov.f32 s8, s3
 ; CHECK-NEXT: vmov.f32 s12, s1
 ; CHECK-NEXT: vmov r0, s10
 ; CHECK-NEXT: add r0, r2
+; CHECK-NEXT: vmov r2, s6
 ; CHECK-NEXT: add r2, r3
+; CHECK-NEXT: vmov r3, s2
 ; CHECK-NEXT: add.w r12, r2, r0
 ; CHECK-NEXT: vmov r2, s8
-; CHECK-NEXT: vmov r3, s2
 ; CHECK-NEXT: vmov r0, s0
 ; CHECK-NEXT: add r2, r3
 ; CHECK-NEXT: vmov r3, s12
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
index 8e6e0191e2670..b005cb92dc516 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -330,8 +330,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmovn32_t2(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_t2:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s2
 ; CHECK-NEXT: vmov.f32 s5, s0
+; CHECK-NEXT: vmov.f32 s7, s2
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 ;
@@ -420,8 +420,8 @@ entry:
 define arm_aapcs_vfpcc <4 x i32> @vmovn32_b4(<4 x i32> %src1, <4 x i32> %src2) {
 ; CHECK-LABEL: vmovn32_b4:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s7, s3
 ; CHECK-NEXT: vmov.f32 s5, s1
+; CHECK-NEXT: vmov.f32 s7, s3
 ; CHECK-NEXT: vmov q0, q1
 ; CHECK-NEXT: bx lr
 ;
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index 176246427e64c..d107edbc4c909 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -131,19 +131,18 @@ define void @vst4_v16i32(ptr %src, ptr %dst) {
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #192]
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT: add r2, sp, #128
 ; CHECK-NEXT: vmov q7, q5
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #224]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT: vmov q6, q2
-; CHECK-NEXT: vmov q5, q1
 ; CHECK-NEXT: vmov q7, q3
-; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT: add r2, sp, #64
+; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
+; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
@@ -897,19 +896,18 @@ define void @vst4_v16f32(ptr %src, ptr %dst) {
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #192]
 ; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
 ; CHECK-NEXT: add r2, sp, #128
 ; CHECK-NEXT: vmov q7, q5
 ; CHECK-NEXT: vldrw.u32 q3, [r0, #224]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
 ; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT: vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT: vmov q6, q2
-; CHECK-NEXT: vmov q5, q1
 ; CHECK-NEXT: vmov q7, q3
-; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT: add r2, sp, #64
+; CHECK-NEXT: vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
+; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
 ; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
 ; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: vst40.32 {q0, q1, q2, q3}, [r1]
diff --git a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
index bad1bf15c145f..297667cf979ad 100644
--- a/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-zext-masked-load.ll
@@ -65,9 +65,9 @@ define arm_aapcs_vfpcc <4 x double> @foo_v4i32(ptr nocapture readonly %pSrc, i32
 ; CHECK-NEXT: vmov.f32 s0, s18
 ; CHECK-NEXT: vmov.f32 s2, s19
 ; CHECK-NEXT: vmov d9, r0, r1
+; CHECK-NEXT: vand q5, q0, q5
 ; CHECK-NEXT: vmov r0, r1, d12
 ; CHECK-NEXT: vmov r4, r5, d11
-; CHECK-NEXT: vand q5, q0, q5
 ; CHECK-NEXT: bl __aeabi_ul2d
 ; CHECK-NEXT: vmov d8, r0, r1
 ; CHECK-NEXT: mov r0, r4
diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
index f9d6663b5b8a3..ded4134589c0f 100644
--- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
@@ -19,14 +19,14 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT: movq 24(%rdi), %r29
 ; EGPR-NEXT: movq 16(%rdi), %r17
 ; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NEXT: movq 32(%rdi), %r10
-; EGPR-NEXT: movq 56(%rdi), %r15
-; EGPR-NEXT: movq 40(%rdi), %rdi
-; EGPR-NEXT: movq 48(%r24), %r12
 ; EGPR-NEXT: movq 24(%rsi), %r23
 ; EGPR-NEXT: movq 16(%rsi), %r11
 ; EGPR-NEXT: movq (%rsi), %r27
 ; EGPR-NEXT: movq 8(%rsi), %r14
+; EGPR-NEXT: movq 40(%rdi), %rsi
+; EGPR-NEXT: movq 32(%rdi), %r10
+; EGPR-NEXT: movq 56(%rdi), %r15
+; EGPR-NEXT: movq 48(%rdi), %r12
 ; EGPR-NEXT: movq %r12, %rax
 ; EGPR-NEXT: mulq %r27
 ; EGPR-NEXT: movq %rdx, %r8
@@ -55,7 +55,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT: mulq %r27
 ; EGPR-NEXT: movq %rdx, %r20
 ; EGPR-NEXT: movq %rax, %r25
-; EGPR-NEXT: movq %rdi, %rax
+; EGPR-NEXT: movq %rsi, %rax
 ; EGPR-NEXT: mulq %r27
 ; EGPR-NEXT: movq %rdx, %r21
 ; EGPR-NEXT: movq %rax, %r22
@@ -69,7 +69,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT: adcq %r21, %r20
 ; EGPR-NEXT: setb %al
 ; EGPR-NEXT: movzbl %al, %ecx
-; EGPR-NEXT: movq %rdi, %rax
+; EGPR-NEXT: movq %rsi, %rax
 ; EGPR-NEXT: mulq %r14
 ; EGPR-NEXT: movq %rdx, %r21
 ; EGPR-NEXT: movq %rax, %r22
@@ -84,8 +84,8 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT: mulq %r11
 ; EGPR-NEXT: movq %rdx, %r8
 ; EGPR-NEXT: movq %rax, %r30
-; EGPR-NEXT: movq %rdi, %rax
-; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NEXT: movq %rsi, %rax
+; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NEXT: mulq %r11
 ; EGPR-NEXT: movq %rdx, %r19
 ; EGPR-NEXT: movq %rax, %r20
@@ -99,7 +99,7 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NEXT: adcq %r19, %rbx
 ; EGPR-NEXT: setb %al
 ; EGPR-NEXT: movzbl %al, %ecx
-; EGPR-NEXT: movq %rdi, %rax
+; EGPR-NEXT: movq %rsi, %rax
 ; EGPR-NEXT: mulq %r23
 ; EGPR-NEXT: movq %rdx, %r26
 ; EGPR-NEXT: movq %rax, %r8
@@ -1295,22 +1295,15 @@ define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
 ; EGPR-NDD-NEXT: adcq %rcx, %r29, %r8
 ; EGPR-NDD-NEXT: adcq $0, %rdi
 ; EGPR-NDD-NEXT: adcq $0, %rsi, %r9
-; EGPR-NDD-NEXT: movq %r11, %r14
-; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; EGPR-NDD-NEXT: movq 48(%r11), %r11
-; EGPR-NDD-NEXT: movq %r10, %rax
-; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT: movq 48(%r15), %r11
 ; EGPR-NDD-NEXT: movq %r17, %rsi
 ; EGPR-NDD-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT: movq %r17, %rax
 ; EGPR-NDD-NEXT: mulq %r11
-; EGPR-NDD-NEXT: movq %rax, %r29
-; EGPR-NDD-NEXT: movq %r10, %rsi
 ; EGPR-NDD-NEXT: movq %rdx, %r28
-; EGPR-NDD-NEXT: movq %r16, %rax
-; EGPR-NDD-NEXT: movq %r16, %r10
-; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; EGPR-NDD-NEXT: movq %rax, %r29
+; EGPR-NDD-NEXT: movq %r10, %rax
+; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; EGPR-NDD-NEXT: mulq %r11
 ; EGPR-NDD-NEXT: addq %rax, %r28
 ; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index ab48bc292a40c..3fb994cdb751a 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -332,11 +332,11 @@ define void @store_i256(ptr %ptr, i256 %v) {
 ; CHECK-O3: # %bb.0:
 ; CHECK-O3-NEXT: subq $40, %rsp
 ; CHECK-O3-NEXT: .cfi_def_cfa_offset 48
-; CHECK-O3-NEXT: movq %rsi, (%rsp)
 ; CHECK-O3-NEXT: movq %rdi, %rax
 ; CHECK-O3-NEXT: movq %r8, {{[0-9]+}}(%rsp)
 ; CHECK-O3-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
 ; CHECK-O3-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; CHECK-O3-NEXT: movq %rsi, (%rsp)
 ; CHECK-O3-NEXT: movq %rsp, %rdx
 ; CHECK-O3-NEXT: movl $32, %edi
 ; CHECK-O3-NEXT: movq %rax, %rsi
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index 5d97a56ce12c6..b39b089faa2a5 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -937,17 +937,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT: kmovw %k1, %r12d
 ; KNL-NEXT: kshiftrw $13, %k0, %k1
 ; KNL-NEXT: kmovw %k1, %r13d
+; KNL-NEXT: kshiftrw $14, %k0, %k1
 ; KNL-NEXT: andl $1, %edx
 ; KNL-NEXT: movb %dl, 2(%rax)
 ; KNL-NEXT: kmovw %k0, %edx
 ; KNL-NEXT: andl $1, %edx
 ; KNL-NEXT: andl $1, %r9d
-; KNL-NEXT: kshiftrw $14, %k0, %k1
 ; KNL-NEXT: leal (%rdx,%r9,2), %r9d
 ; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kshiftrw $15, %k0, %k0
 ; KNL-NEXT: andl $1, %r8d
 ; KNL-NEXT: leal (%r9,%r8,4), %r9d
-; KNL-NEXT: kshiftrw $15, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %r8d
 ; KNL-NEXT: andl $1, %esi
 ; KNL-NEXT: leal (%r9,%rsi,8), %esi
@@ -1250,17 +1250,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT: kmovd %k1, %r12d
 ; SKX-NEXT: kshiftrd $13, %k0, %k1
 ; SKX-NEXT: kmovd %k1, %r13d
+; SKX-NEXT: kshiftrd $14, %k0, %k1
 ; SKX-NEXT: andl $1, %edx
 ; SKX-NEXT: movb %dl, 2(%rax)
 ; SKX-NEXT: kmovd %k0, %edx
 ; SKX-NEXT: andl $1, %edx
 ; SKX-NEXT: andl $1, %r9d
-; SKX-NEXT: kshiftrd $14, %k0, %k1
 ; SKX-NEXT: leal (%rdx,%r9,2), %r9d
 ; SKX-NEXT: kmovd %k1, %edx
+; SKX-NEXT: kshiftrd $15, %k0, %k0
 ; SKX-NEXT: andl $1, %r8d
 ; SKX-NEXT: leal (%r9,%r8,4), %r9d
-; SKX-NEXT: kshiftrd $15, %k0, %k0
 ; SKX-NEXT: kmovd %k0, %r8d
 ; SKX-NEXT: andl $1, %esi
 ; SKX-NEXT: leal (%r9,%rsi,8), %esi
@@ -1563,56 +1563,56 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT: kmovw %k1, %edx
 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %ecx
+; KNL_X32-NEXT: kshiftrw $6, %k0, %k1
 ; KNL_X32-NEXT: andl $1, %ebx
 ; KNL_X32-NEXT: movb %bl, 2(%eax)
 ; KNL_X32-NEXT: kmovw %k0, %ebx
 ; KNL_X32-NEXT: andl $1, %ebx
 ; KNL_X32-NEXT: andl $1, %ebp
-; KNL_X32-NEXT: kshiftrw $6, %k0, %k1
 ; KNL_X32-NEXT: leal (%ebx,%ebp,2), %ebx
 ; KNL_X32-NEXT: kmovw %k1, %ebp
+; KNL_X32-NEXT: kshiftrw $7, %k0, %k1
 ; KNL_X32-NEXT: andl $1, %esi
 ; KNL_X32-NEXT: leal (%ebx,%esi,4), %ebx
-; KNL_X32-NEXT: kshiftrw $7, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %esi
+; KNL_X32-NEXT: kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT: andl $1, %edi
 ; KNL_X32-NEXT: leal (%ebx,%edi,8), %ebx
-; KNL_X32-NEXT: kshiftrw $8, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %edi
+; KNL_X32-NEXT: kshiftrw $9, %k0, %k1
 ; KNL_X32-NEXT: andl $1, %edx
 ; KNL_X32-NEXT: shll $4, %edx
 ; KNL_X32-NEXT: orl %ebx, %edx
-; KNL_X32-NEXT: kshiftrw $9, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %ebx
+; KNL_X32-NEXT: kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT: andl $1, %ecx
 ; KNL_X32-NEXT: shll $5, %ecx
 ; KNL_X32-NEXT: orl %edx, %ecx
-; KNL_X32-NEXT: kshiftrw $10, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %edx
+; KNL_X32-NEXT: kshiftrw $11, %k0, %k1
 ; KNL_X32-NEXT: andl $1, %ebp
 ; KNL_X32-NEXT: shll $6, %ebp
 ; KNL_X32-NEXT: andl $1, %esi
 ; KNL_X32-NEXT: shll $7, %esi
-; KNL_X32-NEXT: kshiftrw $11, %k0, %k1
 ; KNL_X32-NEXT: orl %ebp, %esi
 ; KNL_X32-NEXT: kmovw %k1, %ebp
+; KNL_X32-NEXT: kshiftrw $12, %k0, %k1
 ; KNL_X32-NEXT: andl $1, %edi
 ; KNL_X32-NEXT: shll $8, %edi
 ; KNL_X32-NEXT: orl %esi, %edi
-; KNL_X32-NEXT: kshiftrw $12, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %esi
+; KNL_X32-NEXT: kshiftrw $13, %k0, %k1
 ; KNL_X32-NEXT: andl $1, %ebx
 ; KNL_X32-NEXT: shll $9, %ebx
 ; KNL_X32-NEXT: orl %edi, %ebx
-; KNL_X32-NEXT: kshiftrw $13, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %edi
+; KNL_X32-NEXT: kshiftrw $14, %k0, %k1
 ; KNL_X32-NEXT: andl $1, %edx
 ; KNL_X32-NEXT: shll $10, %edx
 ; KNL_X32-NEXT: orl %ebx, %edx
-; KNL_X32-NEXT: kshiftrw $14, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %ebx
-; KNL_X32-NEXT: orl %ecx, %edx
 ; KNL_X32-NEXT: kshiftrw $15, %k0, %k0
+; KNL_X32-NEXT: orl %ecx, %edx
 ; KNL_X32-NEXT: kmovw %k0, %ecx
 ; KNL_X32-NEXT: andl $1, %ebp
 ; KNL_X32-NEXT: shll $11, %ebp
@@ -1891,17 +1891,17 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT: kmovd %k1, %r12d
 ; FASTISEL-NEXT: kshiftrd $13, %k0, %k1
 ; FASTISEL-NEXT: kmovd %k1, %r13d
+; FASTISEL-NEXT: kshiftrd $14, %k0, %k1
 ; FASTISEL-NEXT: andl $1, %edx
 ; FASTISEL-NEXT: movb %dl, 2(%rax)
 ; FASTISEL-NEXT: kmovd %k0, %edx
 ; FASTISEL-NEXT: andl $1, %edx
 ; FASTISEL-NEXT: andl $1, %r9d
-; FASTISEL-NEXT: kshiftrd $14, %k0, %k1
 ; FASTISEL-NEXT: leal (%rdx,%r9,2), %r9d
 ; FASTISEL-NEXT: kmovd %k1, %edx
+; FASTISEL-NEXT: kshiftrd $15, %k0, %k0
 ; FASTISEL-NEXT: andl $1, %r8d
 ; FASTISEL-NEXT: leal (%r9,%r8,4), %r9d
-; FASTISEL-NEXT: kshiftrd $15, %k0, %k0
 ; FASTISEL-NEXT: kmovd %k0, %r8d
 ; FASTISEL-NEXT: andl $1, %esi
 ; FASTISEL-NEXT: leal (%r9,%rsi,8), %esi
@@ -3113,22 +3113,22 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x
 ; KNL_X32-NEXT: kmovw %k1, %eax
 ; KNL_X32-NEXT: kshiftrw $1, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %edx
+; KNL_X32-NEXT: kshiftrw $2, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k0, %ebx
 ; KNL_X32-NEXT: andb $1, %bl
 ; KNL_X32-NEXT: andb $1, %dl
 ; KNL_X32-NEXT: addb %dl, %dl
-; KNL_X32-NEXT: kshiftrw $2, %k0, %k1
 ; KNL_X32-NEXT: orb %bl, %dl
 ; KNL_X32-NEXT: kmovw %k1, %ebx
+; KNL_X32-NEXT: kshiftrw $3, %k0, %k1
 ; KNL_X32-NEXT: andb $1, %bl
 ; KNL_X32-NEXT: shlb $2, %bl
 ; KNL_X32-NEXT: orb %dl, %bl
-; KNL_X32-NEXT: kshiftrw $3, %k0, %k1
 ; KNL_X32-NEXT: kmovw %k1, %edx
+; KNL_X32-NEXT: kshiftrw $4, %k0, %k0
 ; KNL_X32-NEXT: andb $1, %dl
 ; KNL_X32-NEXT: shlb $3, %dl
 ; KNL_X32-NEXT: orb %bl, %dl
-; KNL_X32-NEXT: kshiftrw $4, %k0, %k0
 ; KNL_X32-NEXT: kmovw %k0, %ebx
 ; KNL_X32-NEXT: andb $1, %bl
 ; KNL_X32-NEXT: shlb $4, %bl
diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
index 2cace3060def4..1e6b30400d819 100644
--- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
@@ -10,18 +10,18 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8>
 ; X86BW: # %bb.0:
 ; X86BW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
-; X86BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X86BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xc9,0x04]
-; X86BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x03]
+; X86BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X86BW-NEXT: retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_128:
 ; X64BW: # %bb.0:
 ; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64BW-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
-; X64BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X64BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xc9,0x04]
-; X64BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64BW-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xc1,0x03]
+; X64BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X64BW-NEXT: retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_128:
@@ -69,18 +69,18 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8>
 ; X86BW: # %bb.0:
 ; X86BW-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
-; X86BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X86BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xc9,0x04]
-; X86BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x03]
+; X86BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X86BW-NEXT: retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_256:
 ; X64BW: # %bb.0:
 ; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64BW-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
-; X64BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X64BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xc9,0x04]
-; X64BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X64BW-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xc1,0x03]
+; X64BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X64BW-NEXT: retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_256:
@@ -135,18 +135,18 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8>
 ; X86BW: # %bb.0:
 ; X86BW-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
-; X86BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X86BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xc9,0x04]
-; X86BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X86BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x03]
+; X86BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X86BW-NEXT: retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineinvqb_512:
 ; X64BW: # %bb.0:
 ; X64BW-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64BW-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
-; X64BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X64BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xc9,0x04]
-; X64BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X64BW-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xc1,0x03]
+; X64BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X64BW-NEXT: retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineinvqb_512:
@@ -221,18 +221,18 @@ define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %s
 ; X86BW: # %bb.0:
 ; X86BW-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
-; X86BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X86BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xc9,0x04]
-; X86BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X86BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x03]
+; X86BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X86BW-NEXT: retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_128:
 ; X64BW: # %bb.0:
 ; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64BW-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
-; X64BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X64BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xc9,0x04]
-; X64BW-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
+; X64BW-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xc1,0x03]
+; X64BW-NEXT: vmovdqa %xmm4, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcc]
 ; X64BW-NEXT: retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_128:
@@ -280,18 +280,18 @@ define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %s
 ; X86BW: # %bb.0:
 ; X86BW-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
-; X86BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X86BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xc9,0x04]
-; X86BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X86BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x03]
+; X86BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X86BW-NEXT: retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_256:
 ; X64BW: # %bb.0:
 ; X64BW-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64BW-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
-; X64BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X64BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xc9,0x04]
-; X64BW-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
+; X64BW-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xc1,0x03]
+; X64BW-NEXT: vmovdqa %ymm4, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcc]
 ; X64BW-NEXT: retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_256:
@@ -346,18 +346,18 @@ define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %s
 ; X86BW: # %bb.0:
 ; X86BW-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
 ; X86BW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
-; X86BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X86BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xc9,0x04]
-; X86BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X86BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
+; X86BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x03]
+; X86BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X86BW-NEXT: retl # encoding: [0xc3]
 ;
 ; X64BW-LABEL: test_vgf2p8affineqb_512:
 ; X64BW: # %bb.0:
 ; X64BW-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; X64BW-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
-; X64BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X64BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xc9,0x04]
-; X64BW-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
+; X64BW-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
+; X64BW-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xc1,0x03]
+; X64BW-NEXT: vmovdqa64 %zmm4, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcc]
 ; X64BW-NEXT: retq # encoding: [0xc3]
 ;
 ; X86NOBW-LABEL: test_vgf2p8affineqb_512:
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index 113828ae54ccd..2a77d0238721c 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -313,10 +313,10 @@ define i16 @test15(ptr%addr) nounwind {
 define i16 @test16(ptr%addr, i16 %a) nounwind {
 ; KNL-LABEL: test16:
 ; KNL: ## %bb.0:
+; KNL-NEXT: movzbl (%rdi), %eax
 ; KNL-NEXT: kmovw %esi, %k0
 ; KNL-NEXT: movw $-1025, %cx ## imm = 0xFBFF
 ; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: movzbl (%rdi), %eax
 ; KNL-NEXT: kandw %k1, %k0, %k0
 ; KNL-NEXT: kmovw %eax, %k1
 ; KNL-NEXT: kshiftlw $15, %k1, %k1
@@ -349,10 +349,10 @@ define i16 @test16(ptr%addr, i16 %a) nounwind {
 define i8 @test17(ptr%addr, i8 %a) nounwind {
 ; KNL-LABEL: test17:
 ; KNL: ## %bb.0:
+; KNL-NEXT: movzbl (%rdi), %eax
 ; KNL-NEXT: kmovw %esi, %k0
 ; KNL-NEXT: movw $-17, %cx
 ; KNL-NEXT: kmovw %ecx, %k1
-; KNL-NEXT: movzbl (%rdi), %eax
 ; KNL-NEXT: kandw %k1, %k0, %k0
 ; KNL-NEXT: kmovw %eax, %k1
 ; KNL-NEXT: kshiftlw $15, %k1, %k1
@@ -843,12 +843,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; KNL-LABEL: test_insertelement_v32i1:
 ; KNL: ## %bb.0:
 ; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: setb %al
 ; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
-; KNL-NEXT: shll $16, %ecx
 ; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: shll $16, %ecx
 ; KNL-NEXT: movw $-17, %dx
 ; KNL-NEXT: kmovw %edx, %k1
-; KNL-NEXT: setb %al
 ; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0 {%k1}
 ; KNL-NEXT: kmovw %eax, %k1
 ; KNL-NEXT: kshiftlw $15, %k1, %k1
@@ -862,12 +862,12 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; SKX-LABEL: test_insertelement_v32i1:
 ; SKX: ## %bb.0:
 ; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: setb %al
 ; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0
 ; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1
 ; SKX-NEXT: kunpckwd %k0, %k1, %k0
 ; SKX-NEXT: movl $-17, %ecx
 ; SKX-NEXT: kmovd %ecx, %k1
-; SKX-NEXT: setb %al
 ; SKX-NEXT: kandd %k1, %k0, %k0
 ; SKX-NEXT: kmovd %eax, %k1
 ; SKX-NEXT: kshiftld $31, %k1, %k1
@@ -889,10 +889,10 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1
 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
 ; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: setb %al
 ; KNL-NEXT: movw $-5, %cx
 ; KNL-NEXT: kmovw %ecx, %k1
 ; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; KNL-NEXT: setb %al
 ; KNL-NEXT: kmovw %eax, %k1
 ; KNL-NEXT: kshiftlw $15, %k1, %k1
 ; KNL-NEXT: kshiftrw $13, %k1, %k1
@@ -905,10 +905,10 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; SKX-LABEL: test_iinsertelement_v4i1:
 ; SKX: ## %bb.0:
 ; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: setb %al
 ; SKX-NEXT: movb $-5, %cl
 ; SKX-NEXT: kmovd %ecx, %k1
 ; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 {%k1}
-; SKX-NEXT: setb %al
 ; SKX-NEXT: kmovd %eax, %k1
 ; SKX-NEXT: kshiftlb $7, %k1, %k1
 ; SKX-NEXT: kshiftrb $5, %k1, %k1
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 373bc00d004bd..9e689341f7b88 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1266,10 +1266,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; KNL-NEXT: kmovw %eax, %k2
 ; KNL-NEXT: kmovw %edi, %k3
 ; KNL-NEXT: cmpl %edx, %esi
+; KNL-NEXT: setg %al
 ; KNL-NEXT: movw $-33, %cx
 ; KNL-NEXT: kmovw %ecx, %k4
 ; KNL-NEXT: kandw %k4, %k0, %k0
-; KNL-NEXT: setg %al
 ; KNL-NEXT: kmovw %eax, %k4
 ; KNL-NEXT: kshiftlw $15, %k4, %k4
 ; KNL-NEXT: kshiftrw $10, %k4, %k4
@@ -1291,10 +1291,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; SKX: ## %bb.0:
 ; SKX-NEXT: kmovq %rdi, %k0
 ; SKX-NEXT: cmpl %edx, %esi
+; SKX-NEXT: setg %al
 ; SKX-NEXT: movq $-33, %rcx
 ; SKX-NEXT: kmovq %rcx, %k1
 ; SKX-NEXT: kandq %k1, %k0, %k0
-; SKX-NEXT: setg %al
 ; SKX-NEXT: kmovd %eax, %k1
 ; SKX-NEXT: kshiftlq $63, %k1, %k1
 ; SKX-NEXT: kshiftrq $58, %k1, %k1
@@ -1306,10 +1306,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; AVX512BW: ## %bb.0:
 ; AVX512BW-NEXT: kmovq %rdi, %k0
 ; AVX512BW-NEXT: cmpl %edx, %esi
+; AVX512BW-NEXT: setg %al
 ; AVX512BW-NEXT: movq $-33, %rcx
 ; AVX512BW-NEXT: kmovq %rcx, %k1
 ; AVX512BW-NEXT: kandq %k1, %k0, %k0
-; AVX512BW-NEXT: setg %al
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: kshiftlq
$63, %k1, %k1 ; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 @@ -1329,10 +1329,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kmovw %edi, %k3 ; AVX512DQ-NEXT: cmpl %edx, %esi +; AVX512DQ-NEXT: setg %al ; AVX512DQ-NEXT: movw $-33, %cx ; AVX512DQ-NEXT: kmovw %ecx, %k4 ; AVX512DQ-NEXT: kandw %k4, %k1, %k1 -; AVX512DQ-NEXT: setg %al ; AVX512DQ-NEXT: kmovw %eax, %k4 ; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 ; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 @@ -1355,11 +1355,11 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: setg %al ; X86-NEXT: kshiftrq $6, %k0, %k1 ; X86-NEXT: kshiftlq $6, %k1, %k1 ; X86-NEXT: kshiftlq $59, %k0, %k0 ; X86-NEXT: kshiftrq $59, %k0, %k0 -; X86-NEXT: setg %al ; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: kmovd %eax, %k1 ; X86-NEXT: kshiftlq $63, %k1, %k1 diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 9327ee800af19..727e9ccfe27d5 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -1959,9 +1959,9 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin ; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] ; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] +; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] ; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05] @@ -2134,9 +2134,9 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m ; X86-NEXT: addl %esi, %edx # encoding: [0x01,0xf2] ; X86-NEXT: adcl %ecx, %eax # encoding: [0x11,0xc8] ; X86-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04] +; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0] ; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6] -; X86-NEXT: kshiftrq $32, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x31,0xd0,0x20] ; X86-NEXT: kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2] ; X86-NEXT: adcl %eax, %edx # encoding: [0x11,0xc2] ; X86-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05] diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 21c26f3cd78ba..c0bb0037923dc 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -10720,9 +10720,9 @@ declare i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32>, <4 x i32>,i8) define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_ptestm_d_128: ; X86: # %bb.0: +; X86-NEXT: vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: 
[0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1] ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] @@ -10779,9 +10779,9 @@ declare i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64>, <2 x i64>, i8) define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_ptestm_q_128: ; X86: # %bb.0: +; X86-NEXT: vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1] ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] @@ -10810,9 +10810,9 @@ declare i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64>, <4 x i64>, i8) define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_ptestm_q_256: ; X86: # %bb.0: +; X86-NEXT: vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1] ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] @@ -10843,9 +10843,9 @@ declare i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32>, <4 x i32>, i8 %x2) define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_ptestnm_d_128: ; X86: # %bb.0: +; X86-NEXT: vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1] ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] @@ -10902,9 +10902,9 @@ declare i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64>, <2 x i64>, i8 %x2) define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_ptestnm_q_128: ; X86: # %bb.0: +; X86-NEXT: vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1] ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] @@ -10933,9 +10933,9 @@ declare i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64>, <4 x i64>, i8 %x2) define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_ptestnm_q_256: ; X86: # %bb.0: +; X86-NEXT: vptestnmq %ymm1, %ymm0, %k0 # 
encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vptestnmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1] ; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0] diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 386b63d551988..edf12d219c452 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -184,10 +184,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sarl $31, %eax ; X86-NEXT: xorl %eax, %esi ; X86-NEXT: movl %esi, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %ecx diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 8a1d247543cf9..c8093c820e6a3 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -201,21 +201,19 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: orl $32, %ecx ; X86-NEXT: testl %edi, %edi -; X86-NEXT: movl %edi, %ebx ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bsrl %eax, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl %ebp, %ebp -; X86-NEXT: movl %esi, %edi ; X86-NEXT: xorl $31, %ebp ; X86-NEXT: orl $32, %ebp ; X86-NEXT: testl %eax, %eax ; X86-NEXT: cmovnel %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl $64, %ebp -; X86-NEXT: movl %edi, %edx -; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: cmovnel %ecx, %ebp ; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: movl %esi, %ebx diff --git a/llvm/test/CodeGen/X86/extract-bits.ll b/llvm/test/CodeGen/X86/extract-bits.ll index c10a1ae9b8fb7..8767ea8a81beb 100644 --- a/llvm/test/CodeGen/X86/extract-bits.ll +++ b/llvm/test/CodeGen/X86/extract-bits.ll @@ -232,10 +232,9 @@ define i32 @bextr32_a2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl (%eax), %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: shrl %cl, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl $1, %eax -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: decl %eax ; X86-NOBMI-NEXT: andl %esi, %eax @@ -302,10 +301,9 @@ define i32 @bextr32_a3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl (%eax), %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: shrl %cl, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl $1, %eax -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: decl %eax ; 
X86-NOBMI-NEXT: andl %esi, %eax @@ -439,10 +437,9 @@ define i32 @bextr32_a5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl %eax, %ecx -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: shrl %cl, %edi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl $1, %esi -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %esi ; X86-NOBMI-NEXT: decl %esi ; X86-NOBMI-NEXT: andl %edi, %esi @@ -2344,10 +2341,9 @@ define i32 @bextr32_b2_load(ptr %w, i32 %numskipbits, i32 %numlowbits) nounwind ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl (%eax), %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: shrl %cl, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl $-1, %eax -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: notl %eax ; X86-NOBMI-NEXT: andl %esi, %eax @@ -2414,10 +2410,9 @@ define i32 @bextr32_b3_load_indexzext(ptr %w, i8 zeroext %numskipbits, i8 zeroex ; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl (%eax), %esi -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: shrl %cl, %esi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl $-1, %eax -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %eax ; X86-NOBMI-NEXT: notl %eax ; X86-NOBMI-NEXT: andl %esi, %eax @@ -2551,10 +2546,9 @@ define i32 @bextr32_b5_skipextrauses(i32 %val, i32 %numskipbits, i32 %numlowbits ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NOBMI-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOBMI-NEXT: movl %eax, %ecx -; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %edx ; X86-NOBMI-NEXT: shrl %cl, %edi +; X86-NOBMI-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NOBMI-NEXT: movl $-1, %esi -; X86-NOBMI-NEXT: movl %edx, %ecx ; X86-NOBMI-NEXT: shll %cl, %esi ; X86-NOBMI-NEXT: notl %esi ; X86-NOBMI-NEXT: andl %edi, %esi diff --git a/llvm/test/CodeGen/X86/legalize-shl-vec.ll b/llvm/test/CodeGen/X86/legalize-shl-vec.ll index ed09823b2b515..5ed5c4515b303 100644 --- a/llvm/test/CodeGen/X86/legalize-shl-vec.ll +++ b/llvm/test/CodeGen/X86/legalize-shl-vec.ll @@ -80,12 +80,12 @@ define <2 x i256> @test_srl(<2 x i256> %In) nounwind { ; X86-NEXT: movl %ebp, %esi ; X86-NEXT: shldl $28, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: shldl $28, %ebx, %edx ; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shldl $28, %ecx, %ebx ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: shldl $28, %edi, %esi @@ -162,12 +162,12 @@ define <2 x i256> @test_sra(<2 x i256> %In) nounwind { ; X86-NEXT: movl %ebp, %esi ; X86-NEXT: shldl $26, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: shldl $26, %ebx, %edx ; X86-NEXT: movl %edx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: shldl $26, %ecx, %ebx ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: shldl $26, %edi, %esi diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index 1f4aa669a67e5..c44aa8e61db47 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -1263,14 +1263,15 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; SSE-NEXT: movapd %xmm4, %xmm14 ; SSE-NEXT: mulpd %xmm13, %xmm14 ; SSE-NEXT: addpd %xmm10, %xmm14 +; SSE-NEXT: movapd %xmm6, %xmm4 ; SSE-NEXT: mulpd %xmm6, %xmm13 ; SSE-NEXT: addpd %xmm15, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: mulpd %xmm6, %xmm8 ; SSE-NEXT: movapd %xmm7, %xmm10 ; SSE-NEXT: mulpd %xmm8, %xmm10 ; SSE-NEXT: addpd %xmm13, %xmm10 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: mulpd %xmm6, %xmm8 ; SSE-NEXT: addpd %xmm14, %xmm8 ; SSE-NEXT: movapd %xmm12, %xmm13 ; SSE-NEXT: unpcklpd {{.*#+}} xmm13 = xmm13[0],xmm12[0] @@ -1288,8 +1289,8 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun ; SSE-NEXT: movapd %xmm5, %xmm14 ; SSE-NEXT: mulpd %xmm13, %xmm14 ; SSE-NEXT: addpd %xmm12, %xmm14 -; SSE-NEXT: mulpd %xmm6, %xmm13 -; SSE-NEXT: movapd %xmm6, %xmm2 +; SSE-NEXT: mulpd %xmm4, %xmm13 +; SSE-NEXT: movapd %xmm4, %xmm2 ; SSE-NEXT: addpd %xmm15, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1,1] ; SSE-NEXT: movapd %xmm7, %xmm12 @@ -1653,29 +1654,28 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[1,1] ; SSE-NEXT: movaps %xmm3, %xmm10 ; SSE-NEXT: movaps %xmm3, %xmm12 -; SSE-NEXT: addps %xmm15, %xmm0 ; SSE-NEXT: mulps %xmm0, %xmm10 -; SSE-NEXT: addps %xmm5, %xmm10 ; SSE-NEXT: mulps %xmm2, %xmm0 +; SSE-NEXT: addps %xmm15, %xmm0 ; SSE-NEXT: movaps %xmm14, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[2,2] ; SSE-NEXT: movaps %xmm4, %xmm2 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: movaps %xmm4, %xmm15 ; SSE-NEXT: mulps %xmm1, %xmm2 -; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm13 ; SSE-NEXT: addps %xmm0, %xmm2 -; SSE-NEXT: mulps %xmm11, %xmm1 -; SSE-NEXT: addps %xmm10, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,3,3,3] ; SSE-NEXT: movaps %xmm7, %xmm3 ; SSE-NEXT: mulps %xmm14, %xmm3 -; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: mulps %xmm6, %xmm14 ; SSE-NEXT: addps %xmm2, %xmm14 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: addps %xmm5, %xmm10 +; SSE-NEXT: movaps %xmm4, %xmm15 +; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: mulps %xmm11, %xmm1 +; SSE-NEXT: addps %xmm10, %xmm1 +; SSE-NEXT: addps %xmm1, %xmm3 ; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm5[0,0] -; SSE-NEXT: movaps %xmm13, %xmm2 ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: addps %xmm14, %xmm2 ; SSE-NEXT: mulps {{[0-9]+}}(%rsp), %xmm1 @@ -1913,9 +1913,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm4, %xmm7 ; SSE-NEXT: mulps %xmm0, %xmm7 ; SSE-NEXT: addps %xmm1, %xmm7 -; SSE-NEXT: movaps %xmm12, %xmm15 -; SSE-NEXT: mulps %xmm6, %xmm0 ; SSE-NEXT: movaps %xmm6, %xmm3 +; SSE-NEXT: mulps %xmm6, %xmm0 ; SSE-NEXT: addps %xmm2, %xmm0 ; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: movaps %xmm4, %xmm1 @@ -1956,6 +1955,7 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, 
<64 x float> %a1) nounwin ; SSE-NEXT: mulps %xmm1, %xmm2 ; SSE-NEXT: movaps %xmm0, %xmm14 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,1],xmm0[1,1] +; SSE-NEXT: movaps %xmm12, %xmm15 ; SSE-NEXT: movaps %xmm12, %xmm13 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: mulps %xmm14, %xmm15 @@ -2797,9 +2797,8 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vmulps %ymm1, %ymm10, %ymm1 ; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm1 -; AVX512F-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; AVX512F-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload ; AVX512F-NEXT: vbroadcastss %xmm1, %ymm1 +; AVX512F-NEXT: vmulps %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vmovshdup {{.*#+}} ymm1 = ymm13[1,1,3,3,5,5,7,7] ; AVX512F-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -2813,9 +2812,9 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX512F-NEXT: vmulps %ymm1, %ymm8, %ymm1 ; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vinsertf64x4 $1, %ymm6, %zmm14, %zmm2 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm12, %zmm3 -; AVX512F-NEXT: vmovaps %zmm4, %zmm0 +; AVX512F-NEXT: vinsertf64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 32-byte Folded Reload +; AVX512F-NEXT: vinsertf64x4 $1, %ymm6, %zmm14, %zmm2 ; AVX512F-NEXT: vmovaps %zmm5, %zmm1 ; AVX512F-NEXT: retq ; @@ -3528,16 +3527,16 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: movapd %xmm15, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 ; SSE-NEXT: addpd %xmm3, %xmm4 -; SSE-NEXT: movapd %xmm10, %xmm5 -; SSE-NEXT: movapd %xmm13, %xmm3 -; SSE-NEXT: mulpd %xmm1, %xmm5 -; SSE-NEXT: addpd %xmm5, %xmm4 ; SSE-NEXT: movapd %xmm13, %xmm8 +; SSE-NEXT: movapd %xmm13, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: movapd %xmm10, %xmm5 ; SSE-NEXT: movapd %xmm10, %xmm15 +; SSE-NEXT: mulpd %xmm1, %xmm5 ; SSE-NEXT: addpd %xmm3, %xmm5 ; SSE-NEXT: movapd %xmm12, %xmm10 ; SSE-NEXT: mulpd %xmm12, %xmm0 +; SSE-NEXT: movapd %xmm14, %xmm9 ; SSE-NEXT: mulpd %xmm14, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 @@ -3587,24 +3586,24 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: addpd %xmm6, %xmm0 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 +; SSE-NEXT: addpd %xmm5, %xmm4 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 ; SSE-NEXT: mulpd %xmm1, %xmm5 ; SSE-NEXT: addpd %xmm7, %xmm5 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 ; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; SSE-NEXT: movapd %xmm14, %xmm9 ; SSE-NEXT: addpd %xmm3, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movapd %xmm7, %xmm3 +; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm7[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: mulpd %xmm7, %xmm2 +; SSE-NEXT: mulpd %xmm3, %xmm2 ; SSE-NEXT: addpd %xmm1, %xmm2 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: mulpd %xmm7, %xmm1 +; SSE-NEXT: mulpd %xmm3, %xmm1 ; SSE-NEXT: addpd %xmm5, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: mulpd %xmm7, %xmm5 +; SSE-NEXT: mulpd %xmm3, %xmm5 ; SSE-NEXT: addpd %xmm4, %xmm5 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm4 ; SSE-NEXT: mulpd %xmm4, %xmm3 @@ -3749,6 +3748,7 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x 
double> %a1) noun ; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 ; SSE-NEXT: mulpd %xmm0, %xmm11 +; SSE-NEXT: movapd %xmm13, %xmm6 ; SSE-NEXT: movapd %xmm13, %xmm4 ; SSE-NEXT: mulpd %xmm1, %xmm4 ; SSE-NEXT: addpd %xmm11, %xmm4 @@ -3792,7 +3792,6 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: unpcklpd {{.*#+}} xmm9 = xmm9[0],xmm1[0] ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm3 ; SSE-NEXT: mulpd %xmm9, %xmm3 -; SSE-NEXT: movapd %xmm13, %xmm6 ; SSE-NEXT: addpd %xmm0, %xmm3 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm10 ; SSE-NEXT: mulpd %xmm9, %xmm10 @@ -3854,20 +3853,20 @@ define <64 x double> @test_mul8x8_f64(<64 x double> %a0, <64 x double> %a1) noun ; SSE-NEXT: mulpd %xmm1, %xmm2 ; SSE-NEXT: addpd %xmm3, %xmm2 ; SSE-NEXT: movapd %xmm6, %xmm7 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movapd %xmm14, %xmm3 ; SSE-NEXT: mulpd %xmm1, %xmm7 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: movapd %xmm6, %xmm9 +; SSE-NEXT: mulpd %xmm1, %xmm9 +; SSE-NEXT: mulpd %xmm5, %xmm1 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movapd %xmm14, %xmm3 ; SSE-NEXT: mulpd %xmm0, %xmm3 +; SSE-NEXT: addpd %xmm3, %xmm7 +; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: movapd %xmm4, %xmm3 -; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: addpd %xmm4, %xmm7 ; SSE-NEXT: mulpd %xmm0, %xmm3 -; SSE-NEXT: movapd %xmm6, %xmm9 -; SSE-NEXT: mulpd %xmm1, %xmm9 ; SSE-NEXT: addpd %xmm3, %xmm9 ; SSE-NEXT: mulpd %xmm8, %xmm0 -; SSE-NEXT: mulpd %xmm5, %xmm1 ; SSE-NEXT: addpd %xmm0, %xmm1 ; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 ; SSE-NEXT: movapd %xmm0, %xmm10 diff --git a/llvm/test/CodeGen/X86/mul-i256.ll b/llvm/test/CodeGen/X86/mul-i256.ll index 65bd1df582104..80a1a2a0554cc 100644 --- a/llvm/test/CodeGen/X86/mul-i256.ll +++ b/llvm/test/CodeGen/X86/mul-i256.ll @@ -305,13 +305,13 @@ define void @test(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: movq 8(%rdi), %r11 ; X64-NEXT: movq 16(%rdi), %r10 ; X64-NEXT: movq (%rsi), %r9 -; X64-NEXT: movq 8(%rsi), %r14 ; X64-NEXT: movq 24(%rdi), %r15 -; X64-NEXT: imulq %r9, %r15 ; X64-NEXT: movq %r9, %rax ; X64-NEXT: mulq %r10 -; X64-NEXT: movq 16(%rsi), %r8 ; X64-NEXT: movq %rax, %rdi +; X64-NEXT: movq 16(%rsi), %r8 +; X64-NEXT: movq 8(%rsi), %r14 +; X64-NEXT: imulq %r9, %r15 ; X64-NEXT: imulq %r14, %r10 ; X64-NEXT: addq %rdx, %r10 ; X64-NEXT: addq %r15, %r10 diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll index 355f770fd1704..64f6746e616ed 100644 --- a/llvm/test/CodeGen/X86/mul-i512.ll +++ b/llvm/test/CodeGen/X86/mul-i512.ll @@ -1218,11 +1218,11 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind { ; X64-NEXT: addq %rcx, %r9 ; X64-NEXT: adcq %rsi, %rdx ; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: mulq %r8 ; X64-NEXT: movq %rdx, %rcx -; X64-NEXT: movq %rbx, %rsi ; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %r8 diff --git a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll index d7b8734e96e39..4d81193e1b2e5 100644 --- a/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll +++ b/llvm/test/CodeGen/X86/peephole-na-phys-copy-folding.ll @@ -273,7 +273,6 @@ define i64 
@test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64 ; CHECK32: # %bb.0: # %entry ; CHECK32-NEXT: pushl %ebp ; CHECK32-NEXT: pushl %ebx -; CHECK32-NEXT: pushl %edi ; CHECK32-NEXT: pushl %esi ; CHECK32-NEXT: pushl %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebp @@ -282,12 +281,11 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64 ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %edi ; CHECK32-NEXT: lock cmpxchg8b (%esi) +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK32-NEXT: movl %ebp, %edx -; CHECK32-NEXT: movl %edi, %ecx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK32-NEXT: lock cmpxchg8b (%esi) @@ -304,7 +302,6 @@ define i64 @test_two_live_flags(ptr %foo0, i64 %bar0, i64 %baz0, ptr %foo1, i64 ; CHECK32-NEXT: xorl %edx, %edx ; CHECK32-NEXT: addl $4, %esp ; CHECK32-NEXT: popl %esi -; CHECK32-NEXT: popl %edi ; CHECK32-NEXT: popl %ebx ; CHECK32-NEXT: popl %ebp ; CHECK32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index b60b36744f038..f2b990f0bad62 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -1114,9 +1114,8 @@ define <32 x i32> @zext_mulhuw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: zext_mulhuw_v32i16_lshr: @@ -1209,9 +1208,8 @@ define <32 x i32> @mulhsw_v32i16_lshr(<32 x i16> %a, <32 x i16> %b) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mulhsw_v32i16_lshr: @@ -1310,9 +1308,8 @@ define <32 x i32> @mulhsw_v32i16_ashr(<32 x i16> %a, <32 x i16> %b) { ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm1 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 +; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mulhsw_v32i16_ashr: @@ -1484,10 +1481,9 @@ define <64 x i32> @zext_mulhuw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vpmulhuw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: retq ; @@ -1665,10 +1661,9 @@ define <64 x i32> @mulhsw_v64i16_lshr(<64 x i16> %a, <64 x i16> %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: retq ; @@ -1847,10 +1842,9 @@ define <64 x i32> @mulhsw_v64i16_ashr(<64 x i16> %a, <64 x i16> %b) { ; 
AVX512F-NEXT: vpmovsxwd %ymm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4 ; AVX512F-NEXT: vpmulhw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm3 -; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 +; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll index ea72b64ee00c0..29922c2ac1a71 100644 --- a/llvm/test/CodeGen/X86/pr34177.ll +++ b/llvm/test/CodeGen/X86/pr34177.ll @@ -49,13 +49,13 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr { ; AVX512VL-LABEL: test: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0 +; AVX512VL-NEXT: kshiftrb $2, %k0, %k1 ; AVX512VL-NEXT: kmovd %k0, %eax ; AVX512VL-NEXT: testb $2, %al ; AVX512VL-NEXT: fld1 ; AVX512VL-NEXT: fldz ; AVX512VL-NEXT: fld %st(0) ; AVX512VL-NEXT: fcmovne %st(2), %st -; AVX512VL-NEXT: kshiftrb $2, %k0, %k1 ; AVX512VL-NEXT: testb $1, %al ; AVX512VL-NEXT: fld %st(1) ; AVX512VL-NEXT: fcmovne %st(3), %st diff --git a/llvm/test/CodeGen/X86/pr61964.ll b/llvm/test/CodeGen/X86/pr61964.ll index 4fea9c8cffec6..afb4a51769d61 100644 --- a/llvm/test/CodeGen/X86/pr61964.ll +++ b/llvm/test/CodeGen/X86/pr61964.ll @@ -22,9 +22,8 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: splitTransposeDecode_8_avx2: @@ -58,9 +57,8 @@ define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 ; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; XOPAVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1 -; XOPAVX1-NEXT: vmovaps %ymm2, %ymm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: splitTransposeDecode_8_avx2: diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 779999816ebbf..36866cf47aa25 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -428,7 +428,6 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp ; i686-NEXT: sarl $31, %eax ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -450,9 +449,9 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: shll %cl, %edx ; i686-NEXT: orl %eax, %edx +; i686-NEXT: movl {{[0-9]+}}(%esp), %eax ; i686-NEXT: 
movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebp, %eax -; i686-NEXT: movl %ebp, %edx +; i686-NEXT: movl %eax, %edx ; i686-NEXT: andl $7, %edx ; i686-NEXT: shrl $3, %eax ; i686-NEXT: andl $15, %eax @@ -551,23 +550,23 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: pushl %esi ; i686-NEXT: subl $100, %esp ; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl {{[0-9]+}}(%esp), %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl {{[0-9]+}}(%esp), %edi ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebp -; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx ; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx ; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; i686-NEXT: movl {{[0-9]+}}(%esp), %ebx ; i686-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx +; i686-NEXT: movl {{[0-9]+}}(%esp), %eax +; i686-NEXT: movl {{[0-9]+}}(%esp), %edx +; i686-NEXT: movl {{[0-9]+}}(%esp), %esi ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ebp, %ecx +; i686-NEXT: movl %ecx, %ebp ; i686-NEXT: shrl $3, %ebp ; i686-NEXT: andl $15, %ebp ; i686-NEXT: leal {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll index 0771beecda770..2a748b6052421 100644 --- a/llvm/test/CodeGen/X86/smul_fix.ll +++ b/llvm/test/CodeGen/X86/smul_fix.ll @@ -166,17 +166,15 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: shldl $30, %eax, %esi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: shldl $30, %eax, %ebx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: shldl $30, %eax, %ebp diff --git a/llvm/test/CodeGen/X86/smul_fix_sat.ll b/llvm/test/CodeGen/X86/smul_fix_sat.ll index 14db7ac90ef57..177543aff1e8a 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat.ll @@ -265,7 +265,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ecx @@ -279,12 +278,12 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: cmovll %esi, %ecx ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrdl $2, %edx, %edi ; X86-NEXT: cmpl $2, %edx ; X86-NEXT: cmovgel %ebp, %edi ; X86-NEXT: cmpl $-2, %edx ; X86-NEXT: cmovll %esi, %edi -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: shrdl $2, %edx, %ebx diff --git a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll 
b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll index 8a5d3bb093677..b4f12123424c6 100644 --- a/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/smulo-128-legalisation-lowering.ll @@ -91,12 +91,11 @@ define zeroext i1 @smuloi128(i128 %v1, i128 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, %esi ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx @@ -585,12 +584,11 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: mull %ebx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, %esi ; X86-NEXT: mull %ebx ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebx @@ -1293,9 +1291,9 @@ define zeroext i1 @smuloi256(i256 %v1, i256 %v2, ptr %res) { ; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill ; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill +; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index 554548fa8f4c3..60b9b62e94ab2 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -1661,43 +1661,41 @@ define void @vec256_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 11(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movzbl 6(%rdi), %r10d ; SCALAR-NEXT: movzbl 10(%rdi), %ebp -; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 6(%rdi), %r10d ; SCALAR-NEXT: notb %r10b -; SCALAR-NEXT: movzbl 9(%rdi), %r14d +; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 9(%rdi), %r10d ; SCALAR-NEXT: movzbl 8(%rdi), %eax ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl 7(%rdi), %r12d ; SCALAR-NEXT: movzbl 5(%rdi), %r9d ; SCALAR-NEXT: movzbl 4(%rdi), %ebx -; SCALAR-NEXT: movzbl 12(%rdi), %r15d ; SCALAR-NEXT: movzbl 3(%rdi), %r8d ; SCALAR-NEXT: movzbl 2(%rdi), %ecx ; SCALAR-NEXT: movzbl (%rdi), %eax ; SCALAR-NEXT: movzbl 1(%rdi), %r13d +; SCALAR-NEXT: movzbl 12(%rdi), %edi ; SCALAR-NEXT: notb %al ; SCALAR-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r13b ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r8b +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %bl ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r12b ; SCALAR-NEXT: movb %r12b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r11d # 1-byte Folded Reload ; SCALAR-NEXT: notb %r11b -; SCALAR-NEXT: movl %r14d, %r10d ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: notb %bpl ; SCALAR-NEXT: movl %ebp, %r14d ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload ; SCALAR-NEXT: notb %r8b ; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movl %r15d, %edi ; SCALAR-NEXT: notb %dil ; SCALAR-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload @@ -4757,9 +4755,9 @@ define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movzbl 7(%rdi), %ebx ; SCALAR-NEXT: movzbl 6(%rdi), %r10d ; SCALAR-NEXT: movzbl 4(%rdi), %r9d -; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl 5(%rdi), %r15d +; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 5(%rdi), %r9d ; SCALAR-NEXT: movzbl 3(%rdi), %r8d ; SCALAR-NEXT: movzbl 2(%rdi), %ecx ; SCALAR-NEXT: movzbl (%rdi), %eax @@ -4771,8 +4769,7 @@ define void @vec384_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movl %r15d, %r9d +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: notb %r10b ; SCALAR-NEXT: notb %bl @@ -6695,9 +6692,9 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movzbl 7(%rdi), %ebp ; SCALAR-NEXT: movzbl 6(%rdi), %r11d ; SCALAR-NEXT: movzbl 4(%rdi), %r9d -; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r9b -; SCALAR-NEXT: movzbl 5(%rdi), %ebx +; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SCALAR-NEXT: movzbl 5(%rdi), %r9d ; SCALAR-NEXT: movzbl 3(%rdi), %r8d ; SCALAR-NEXT: movzbl 2(%rdi), %ecx ; SCALAR-NEXT: movzbl (%rdi), %eax @@ -6709,8 +6706,7 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: notb %cl ; SCALAR-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r8b -; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SCALAR-NEXT: movl %ebx, %r9d +; SCALAR-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: notb %r9b ; SCALAR-NEXT: notb %r11b ; SCALAR-NEXT: movl %r11d, %ebx @@ -6762,13 +6758,14 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movb %r8b, 14(%rdx) ; SCALAR-NEXT: movb %al, 13(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload -; SCALAR-NEXT: movb %r8b, 4(%rdx) -; SCALAR-NEXT: movb %al, 12(%rdx) +; SCALAR-NEXT: movb %sil, 9(%rdx) +; SCALAR-NEXT: movl %r10d, %esi +; SCALAR-NEXT: movb %r10b, (%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 12(%rdx) ; SCALAR-NEXT: movb %r13b, 11(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload ; SCALAR-NEXT: movb %r15b, 10(%rdx) -; SCALAR-NEXT: movb %sil, 9(%rdx) ; SCALAR-NEXT: movb %r12b, 8(%rdx) ; SCALAR-NEXT: movb %r14b, 7(%rdx) ; SCALAR-NEXT: movb %bl, 6(%rdx) @@ -6776,13 +6773,11 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. ; SCALAR-NEXT: movl %r9d, %r11d ; SCALAR-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload +; SCALAR-NEXT: movb %r8b, 4(%rdx) ; SCALAR-NEXT: movb %bpl, 3(%rdx) ; SCALAR-NEXT: movb %dil, 2(%rdx) ; SCALAR-NEXT: movb %cl, 1(%rdx) ; SCALAR-NEXT: movl %ecx, %r14d -; SCALAR-NEXT: movb %r10b, (%rdx) -; SCALAR-NEXT: movb %al, 22(%rdx) -; SCALAR-NEXT: movl %r10d, %esi ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload ; SCALAR-NEXT: movb %cl, 31(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 1-byte Folded Reload @@ -6800,6 +6795,7 @@ define void @vec512_v16i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec. 
; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ebx # 1-byte Folded Reload ; SCALAR-NEXT: movb %bl, 23(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload +; SCALAR-NEXT: movb %al, 22(%rdx) ; SCALAR-NEXT: movb %r11b, 21(%rdx) ; SCALAR-NEXT: movb %r8b, 20(%rdx) ; SCALAR-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index f357e57b30599..e255855d38232 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -122,17 +122,15 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: shldl $30, %eax, %esi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: shldl $30, %eax, %ebx -; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: shldl $30, %eax, %ebp diff --git a/llvm/test/CodeGen/X86/umul_fix_sat.ll b/llvm/test/CodeGen/X86/umul_fix_sat.ll index f40276822b3fa..4c54e06786ccc 100644 --- a/llvm/test/CodeGen/X86/umul_fix_sat.ll +++ b/llvm/test/CodeGen/X86/umul_fix_sat.ll @@ -194,7 +194,6 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi @@ -205,10 +204,10 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: cmovael %ecx, %esi ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: shrdl $2, %edx, %ebx ; X86-NEXT: cmpl $4, %edx ; X86-NEXT: cmovael %ecx, %ebx -; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: shrdl $2, %edx, %ebp @@ -390,7 +389,6 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %esi @@ -399,8 +397,8 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: cmovol %edi, %esi ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmovol %edi, %ebx -; X86-NEXT: movl %ebp, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: cmovol %edi, %ebp diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 00898ab313a3c..b7fe4f0062a40 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -526,10 +526,10 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm1, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm0 ; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 +; 
AVX1-NEXT: vmovq %xmm1, 16(%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v6i32: @@ -544,11 +544,10 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovq %xmm1, 16(%rdi) ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqa %xmm0, (%rdi) -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm0 +; AVX2-NEXT: vmovq %xmm1, 16(%rdi) ; AVX2-NEXT: retq ; ; AVX512-LABEL: umulo_v6i32: @@ -697,10 +696,10 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { ; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5 ; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) ; AVX1-NEXT: vmovdqa %xmm0, (%rdi) ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm0 ; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 +; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) ; AVX1-NEXT: retq ; ; AVX2-LABEL: umulo_v8i32: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index e71aa794640f7..72c9fcceb6933 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -947,16 +947,16 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; SSE-NEXT: movdqa (%rdi), %xmm10 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: pslld $16, %xmm13 -; SSE-NEXT: movdqa 16(%rdi), %xmm7 ; SSE-NEXT: psrad $16, %xmm13 ; SSE-NEXT: packssdw %xmm0, %xmm13 +; SSE-NEXT: movdqa 16(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index 26f2f17ad2404..e3d1d668b9d43 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -1233,8 +1233,6 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE: # %bb.0: ; SSE-NEXT: subq $40, %rsp ; SSE-NEXT: movdqa 176(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm13 ; SSE-NEXT: movdqa 80(%rdi), %xmm11 ; SSE-NEXT: movdqa (%rdi), %xmm15 @@ -1244,59 +1242,60 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 48(%rdi), %xmm0 ; SSE-NEXT: movdqa 64(%rdi), %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm12, %xmm2 ; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = 
xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: movdqa 160(%rdi), %xmm9 -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1] -; SSE-NEXT: movdqa %xmm11, %xmm8 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm10, %xmm2 ; SSE-NEXT: movdqa %xmm15, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm11 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0] -; SSE-NEXT: movdqa 96(%rdi), %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 ; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pand %xmm1, %xmm3 ; SSE-NEXT: por %xmm2, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0] -; SSE-NEXT: movdqa 112(%rdi), %xmm6 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm5, %xmm2 ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm6, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll index 0f9f83bafdf93..7a6329816a956 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll @@ -1880,10 +1880,10 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa (%rdi), %xmm10 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7] +; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 +; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7] ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vpackusdw %xmm7, %xmm8, %xmm7 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 22262b414df1a..22b987544e1c6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -2416,27 +2416,27 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa 176(%rdi), %xmm12 ; SSE-NEXT: movdqa 208(%rdi), %xmm3 -; SSE-NEXT: movaps 192(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 192(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 ; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3] ; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] -; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm15, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1] ; SSE-NEXT: movaps %xmm15, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 @@ -3675,13 +3675,13 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm14 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] -; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3] ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] @@ -4888,35 +4888,35 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 464(%rdi), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm6 -; SSE-NEXT: movdqa 416(%rdi), %xmm11 -; SSE-NEXT: movdqa 448(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 432(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm15 ; SSE-NEXT: movdqa 96(%rdi), %xmm10 ; SSE-NEXT: movdqa 128(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: movdqa 400(%rdi), %xmm8 ; SSE-NEXT: movdqa 112(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] +; SSE-NEXT: movdqa 400(%rdi), %xmm8 +; SSE-NEXT: movdqa 416(%rdi), %xmm11 +; SSE-NEXT: movdqa 448(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 432(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] ; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] ; SSE-NEXT: movaps %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 @@ -6561,7 +6561,6 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: subq $1048, %rsp # imm = 0x418 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm10 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm7 -; AVX2-NEXT: vmovdqa 480(%rdi), %ymm14 ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm8 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3 @@ -6573,7 +6572,6 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] -; AVX2-NEXT: vmovdqa 512(%rdi), %ymm4 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] @@ -6587,6 +6585,8 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] +; AVX2-NEXT: vmovdqa 512(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 480(%rdi), %ymm14 ; AVX2-NEXT: vmovdqa %ymm7, %ymm11 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index a53ca1ccaa668..e55f8be61bc53 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -1580,22 +1580,18 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE: # %bb.0: ; SSE-NEXT: subq $136, %rsp ; SSE-NEXT: movdqa 64(%rdi), %xmm2 -; SSE-NEXT: movdqa 128(%rdi), %xmm7 ; SSE-NEXT: movdqa 80(%rdi), %xmm11 ; SSE-NEXT: movdqa (%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rdi), %xmm6 ; SSE-NEXT: movdqa 32(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: movdqa 112(%rdi), %xmm9 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 @@ -1605,16 +1601,22 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; SSE-NEXT: movdqa %xmm11, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm11[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm11[2,3] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pslld $16, %xmm11 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] +; SSE-NEXT: movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,0,2,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] +; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: movdqa 128(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm13 = 
xmm13[3,1],xmm0[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1625,10 +1627,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] ; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa 160(%rdi), %xmm14 @@ -1664,7 +1664,6 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: pandn %xmm15, %xmm13 ; SSE-NEXT: pand %xmm10, %xmm9 ; SSE-NEXT: por %xmm13, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[1,3] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm12[2,0] ; SSE-NEXT: movdqa %xmm11, %xmm4 ; SSE-NEXT: psrld $16, %xmm4 @@ -4165,15 +4164,15 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm3 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3] ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1] -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm11 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm5 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] ; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm4 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -4183,9 +4182,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpblendvb %ymm0, %ymm9, %ymm4, %ymm4 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa %ymm11, %ymm5 -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] +; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7] ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm9 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 
ec9f87b201a95..312fa596cd1fe 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -2028,24 +2028,25 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 160(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm13 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] ; SSE-NEXT: movdqa 176(%rdi), %xmm15 ; SSE-NEXT: movdqa 208(%rdi), %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0] -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movaps 144(%rdi), %xmm7 ; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movaps 144(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: movaps %xmm7, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2] ; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535] @@ -4151,9 +4152,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdi), %xmm7 ; SSE-NEXT: movdqa 192(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm12 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm11 @@ -4161,14 +4160,16 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1] -; SSE-NEXT: movaps 144(%rdi), %xmm10 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movaps 144(%rdi), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2] ; SSE-NEXT: movaps %xmm7, %xmm10 @@ -7702,7 +7703,6 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7] ; AVX512DQ-NEXT: vpermq 
{{.*#+}} ymm15 = ymm1[0,1,0,1] ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7,8,9,10],ymm15[11],ymm3[12,13,14,15] -; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm11 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm12 @@ -7714,6 +7714,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero ; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm11 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1} ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7] ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 @@ -8632,12 +8633,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 208(%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: pand %xmm3, %xmm0 @@ -13252,22 +13253,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm14 -; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11 -; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] +; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7] @@ -13278,12 +13272,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0] ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) 
# 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7] +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7] +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 @@ -14108,7 +14109,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %ymm5, %ymm19 -; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vmovdqa %ymm5, %ymm1 +; AVX512-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm21 @@ -14174,6 +14176,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 640(%rdi), %ymm16 ; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2] ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX512-NEXT: vmovdqa 688(%rdi), %xmm3 @@ -14191,8 +14194,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15] ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm14 ; AVX512-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 %ymm19, %ymm1 -; AVX512-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7] ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7] @@ -14479,6 +14480,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufb %ymm9, %ymm12, %ymm12 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6],xmm12[7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vmovdqa %ymm1, %ymm15 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7] ; AVX512-NEXT: vextracti32x4 $1, %ymm12, %xmm25 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3] @@ -14486,7 +14488,6 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3] ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] ; 
AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX512-NEXT: vmovdqa %ymm1, %ymm15 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index ff1e9cf28f2ea..7765b40fe4f29 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -3459,27 +3459,27 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 208(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 240(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm15 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 176(%rdi), %xmm7 -; SSE-NEXT: movdqa 144(%rdi), %xmm6 -; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm0 -; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3] ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0] +; SSE-NEXT: movdqa 192(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 144(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0] @@ -4474,8 +4474,10 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa %xmm9, %xmm14 ; AVX2-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; 
AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -4498,8 +4500,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4935,8 +4935,10 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm14 ; AVX2-FP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1] +; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -4959,8 +4961,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5396,8 +5396,10 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm14 ; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0,0,1,1] +; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] @@ -5420,8 +5422,6 @@ define void 
@load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6288,7 +6288,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7] @@ -6298,9 +6297,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0] ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 -; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -7061,7 +7060,6 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7] ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7] @@ -7071,9 +7069,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm22[2],xmm16[3],xmm22[3] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -12887,9 +12885,10 @@ define void 
@load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3] ; AVX512-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512-NEXT: vpermt2d %xmm1, %xmm10, %xmm2 ; AVX512-NEXT: vmovdqa 560(%rdi), %xmm1 @@ -12975,15 +12974,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1} ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] -; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7] ; AVX512-NEXT: vmovdqa %xmm8, %xmm5 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3] -; AVX512-NEXT: vmovdqa64 %xmm16, %xmm4 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] @@ -13056,6 +13052,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -13389,11 +13386,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1] ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm6 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1] -; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm5 +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1] ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = 
ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] @@ -13456,6 +13452,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX512-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] @@ -13852,6 +13849,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm12 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm9 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9 @@ -13860,9 +13858,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] @@ -14727,9 +14724,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3] ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] -; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm10, %xmm2 ; AVX512DQ-NEXT: vmovdqa 560(%rdi), %xmm1 @@ -14815,15 +14813,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7] -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, 
%zmm0 {%k1} ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] -; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7] ; AVX512DQ-NEXT: vmovdqa %xmm8, %xmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3] -; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm4 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] @@ -14896,6 +14891,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15] +; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -15229,11 +15225,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1] ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm6 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] -; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1] -; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm5 +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] @@ -15296,6 +15291,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7] +; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14] ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] @@ -15692,6 +15688,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9 @@ -15700,9 +15697,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13] ; 
AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15] -; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm12 +; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 65e3ba8b8200b..515ee69203edd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -728,8 +728,6 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-LABEL: load_i32_stride3_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movaps 144(%rdi), %xmm11 -; SSE-NEXT: movaps 128(%rdi), %xmm1 -; SSE-NEXT: movaps 112(%rdi), %xmm13 ; SSE-NEXT: movaps 176(%rdi), %xmm10 ; SSE-NEXT: movaps 160(%rdi), %xmm9 ; SSE-NEXT: movaps (%rdi), %xmm7 @@ -738,26 +736,28 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm15 ; SSE-NEXT: movaps 80(%rdi), %xmm14 +; SSE-NEXT: movaps 64(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0] ; SSE-NEXT: movaps %xmm15, %xmm5 -; SSE-NEXT: movaps 64(%rdi), %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm0 -; SSE-NEXT: movaps %xmm7, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm8[0,2] -; SSE-NEXT: movaps 96(%rdi), %xmm6 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm3 -; SSE-NEXT: movaps %xmm11, %xmm4 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdi), %xmm6 +; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps 112(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, %xmm4 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm1, %xmm12 @@ -1199,41 +1199,41 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE: # %bb.0: ; SSE-NEXT: subq $392, %rsp # imm = 0x188 ; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: movaps 224(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 208(%rdi), %xmm14 ; SSE-NEXT: movaps 272(%rdi), %xmm6 ; SSE-NEXT: movaps 256(%rdi), 
%xmm9 ; SSE-NEXT: movaps 48(%rdi), %xmm2 -; SSE-NEXT: movaps 32(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdi), %xmm4 +; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps 64(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, %xmm0 -; SSE-NEXT: movaps 80(%rdi), %xmm1 -; SSE-NEXT: movaps (%rdi), %xmm13 -; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm1, %xmm12 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,2] -; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[1,0] -; SSE-NEXT: movaps %xmm6, %xmm10 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rdi), %xmm8 +; SSE-NEXT: movaps 32(%rdi), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm8, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] +; SSE-NEXT: movaps 192(%rdi), %xmm4 +; SSE-NEXT: movaps 224(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps (%rdi), %xmm13 ; SSE-NEXT: movaps %xmm13, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps 208(%rdi), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, %xmm10 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] @@ -1763,13 +1763,13 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm3 -; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] ; AVX2-FCP-NEXT: vpermps %ymm11, %ymm12, %ymm3 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FCP-NEXT: vblendps 
{{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] ; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm2 ; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm0 @@ -2146,21 +2146,21 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 448(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 48(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 272(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm12 ; SSE-NEXT: movaps %xmm12, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] ; SSE-NEXT: movaps %xmm9, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 240(%rdi), %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[1,0] +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 272(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 256(%rdi), %xmm13 +; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm13, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] ; SSE-NEXT: movaps %xmm7, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index aa23dcc824c72..1cb51b42d9bc1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -1035,25 +1035,25 @@ define void @load_i32_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3,0,1] ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm1[0],ymm3[2],ymm1[2] ; AVX-NEXT: vmovaps %ymm3, %ymm14 -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps %ymm1, %ymm15 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX-NEXT: vunpcklps {{.*#+}} ymm7 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX-NEXT: vmovaps %ymm2, %ymm10 -; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4] ; AVX-NEXT: vmovaps %ymm1, %ymm3 ; AVX-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 176(%rdi), %xmm6 ; AVX-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm1[0] ; AVX-NEXT: vmovaps %xmm6, %xmm2 -; AVX-NEXT: vmovaps 96(%rdi), %ymm4 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovaps 144(%rdi), %xmm1 ; AVX-NEXT: vmovaps 128(%rdi), %xmm6 -; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vunpcklps {{.*#+}} xmm12 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] ; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,0] +; AVX-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,0],ymm7[4,5],ymm0[6,4] +; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovaps %xmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm4[2,3,0,1] @@ -1777,16 +1777,11 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 304(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 336(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 320(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 368(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 352(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 80(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 64(%rdi), %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 112(%rdi), %xmm4 @@ -1798,17 +1793,22 @@ define void @load_i32_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm1 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 288(%rdi), %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1] ; SSE-NEXT: movaps %xmm6, %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 288(%rdi), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1] ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, %xmm0 @@ -3516,14 +3516,9 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 176(%rdi), %xmm11 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 208(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 80(%rdi), %xmm10 ; SSE-NEXT: movaps 240(%rdi), %xmm6 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 80(%rdi), %xmm10 ; SSE-NEXT: movaps 64(%rdi), %xmm4 ; SSE-NEXT: movaps 112(%rdi), %xmm2 ; SSE-NEXT: movaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3532,21 +3527,26 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movaps %xmm4, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] -; SSE-NEXT: movaps 224(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: movaps 192(%rdi), %xmm8 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 224(%rdi), %xmm7 ; SSE-NEXT: movaps %xmm7, %xmm1 -; SSE-NEXT: movaps 160(%rdi), %xmm5 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE-NEXT: movaps 192(%rdi), %xmm8 ; SSE-NEXT: movaps %xmm8, %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdi), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1] ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 @@ -4674,7 +4674,6 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm14 @@ -4682,14 +4681,15 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermps %ymm14, %ymm2, %ymm0 ; AVX2-NEXT: vpermps %ymm10, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-NEXT: vpermps %ymm9, %ymm2, %ymm3 +; AVX2-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5167,7 +5167,6 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm14 @@ -5175,14 +5174,15 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermps %ymm14, %ymm2, %ymm0 ; AVX2-FP-NEXT: vpermps %ymm10, %ymm2, %ymm1 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-FP-NEXT: vpermps %ymm9, %ymm2, %ymm3 +; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5660,7 +5660,6 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm14 @@ -5668,14 +5667,15 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermps %ymm14, %ymm2, %ymm0 ; AVX2-FCP-NEXT: vpermps %ymm10, %ymm2, %ymm1 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 144(%rdi), %xmm3 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-FCP-NEXT: vpermps %ymm9, %ymm2, %ymm3 +; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index 1238b1c097628..c89040d9388d0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -688,41 +688,41 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: movdqa 128(%rdi), %xmm4 ; SSE-NEXT: movdqa 112(%rdi), %xmm5 -; SSE-NEXT: movapd 80(%rdi), %xmm11 +; SSE-NEXT: movdqa 80(%rdi), %xmm11 ; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: movdqa (%rdi), %xmm14 ; SSE-NEXT: movdqa 16(%rdi), %xmm7 -; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm2 ; SSE-NEXT: 
pshufd {{.*#+}} xmm6 = xmm7[1,1,1,1] ; SSE-NEXT: movdqa %xmm14, %xmm8 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,2,2,2] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: movdqa 64(%rdi), %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] -; SSE-NEXT: movapd %xmm11, %xmm10 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm8[0],xmm6[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[2,2,2,2] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm7[2,3,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa 144(%rdi), %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm14[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3] +; SSE-NEXT: movdqa 64(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm12[0],xmm10[1] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[0,0,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[2,3,2,3] +; SSE-NEXT: movdqa 144(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm3[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1] ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[2,2,3,3] @@ -1294,22 +1294,23 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 240(%rdi), %xmm14 ; SSE-NEXT: movdqa 256(%rdi), %xmm8 ; SSE-NEXT: movdqa 208(%rdi), %xmm6 -; SSE-NEXT: movdqa 16(%rdi), %xmm15 -; SSE-NEXT: movdqa 32(%rdi), %xmm5 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd 160(%rdi), %xmm10 +; SSE-NEXT: movdqa 192(%rdi), %xmm7 +; SSE-NEXT: movdqa 160(%rdi), %xmm10 ; SSE-NEXT: movdqa 176(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa (%rdi), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 16(%rdi), %xmm15 +; SSE-NEXT: movdqa 32(%rdi), %xmm5 +; SSE-NEXT: movdqa 48(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
; SSE-NEXT: movdqa %xmm7, %xmm9 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE-NEXT: movdqa %xmm11, %xmm1 @@ -1343,7 +1344,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1666,10 +1667,9 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7] @@ -1683,7 +1683,6 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] -; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-NEXT: vpermd %ymm12, %ymm10, %ymm10 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -1705,6 +1704,8 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-NEXT: vpbroadcastd 144(%rdi), %ymm13 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] @@ -1714,8 +1715,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm15 +; AVX2-NEXT: vpermd %ymm12, %ymm0, %ymm15 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] @@ -1784,10 +1784,9 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa 224(%rdi), 
%ymm0 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7] @@ -1801,7 +1800,6 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] -; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-FP-NEXT: vpermd %ymm12, %ymm10, %ymm10 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -1823,6 +1821,8 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FP-NEXT: vpbroadcastd 144(%rdi), %ymm13 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] @@ -1832,8 +1832,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm12, %ymm0, %ymm15 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] @@ -1902,10 +1901,9 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,5,2,7] @@ -1919,7 +1917,6 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,0,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm8[4],ymm11[5,6,7] -; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7] @@ -1941,6 +1938,8 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm13 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm13[7] 
+; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [2,7,4,0] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] @@ -1950,8 +1949,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5] -; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm15 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] @@ -2520,40 +2518,42 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE: # %bb.0: ; SSE-NEXT: subq $904, %rsp # imm = 0x388 ; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%rdi), %xmm8 ; SSE-NEXT: movdqa 448(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 432(%rdi), %xmm4 -; SSE-NEXT: movdqa 416(%rdi), %xmm14 -; SSE-NEXT: movapd 128(%rdi), %xmm6 -; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 128(%rdi), %xmm6 ; SSE-NEXT: movdqa 112(%rdi), %xmm7 +; SSE-NEXT: movdqa 80(%rdi), %xmm12 +; SSE-NEXT: movdqa 96(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa 96(%rdi), %xmm15 -; SSE-NEXT: movapd 80(%rdi), %xmm12 -; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm15 +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 400(%rdi), %xmm10 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1] -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 400(%rdi), %xmm10 +; SSE-NEXT: movdqa 416(%rdi), %xmm14 +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: 
movdqa 48(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm6 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: movdqa %xmm11, %xmm5 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -2562,11 +2562,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 320(%rdi), %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 320(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 368(%rdi), %xmm2 @@ -3353,16 +3353,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm5 ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -3370,13 +3366,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] +; AVX2-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] @@ -3632,16 +3632,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 
416(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -3649,13 +3645,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] +; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] @@ -3911,16 +3911,12 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm15 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm14 +; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] @@ -3928,13 +3924,17 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7] +; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] @@ -5005,22 +5005,22 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 432(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 416(%rdi), %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE-NEXT: movdqa 400(%rdi), %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2] ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movdqa 400(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 416(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE-NEXT: movdqa %xmm10, %xmm1 @@ -6742,34 +6742,34 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 896(%rdi), %ymm5 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 832(%rdi), %ymm15 -; AVX2-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa 480(%rdi), %ymm11 -; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = 
[0,5,2,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] -; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa 800(%rdi), %ymm7 ; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] +; AVX2-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa 480(%rdi), %ymm11 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vmovdqa 800(%rdi), %ymm7 +; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] +; AVX2-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] ; AVX2-NEXT: vmovdqa %ymm8, %ymm9 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -7299,34 +7299,34 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm11 -; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] -; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm7 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 +; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] +; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm11 ; AVX2-FP-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] +; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] ; AVX2-FP-NEXT: vmovdqa %ymm8, %ymm9 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -7856,34 +7856,34 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 -; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm11 -; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm13 -; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,2,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7] -; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] -; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm7 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 +; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,0,3] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] +; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm11 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] -; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 +; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9 +; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm9[0,1,0,3] +; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm9 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 864e41510030b..1a56d2a4b452d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -1664,47 +1664,47 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 240(%rdi), %xmm9 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 336(%rdi), %xmm14 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 304(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 352(%rdi), %xmm5 ; SSE-NEXT: movdqa 64(%rdi), %xmm12 -; SSE-NEXT: movdqa 288(%rdi), %xmm15 +; SSE-NEXT: movdqa (%rdi), %xmm8 ; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movapd (%rdi), %xmm8 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa 208(%rdi), %xmm4 ; SSE-NEXT: movdqa 48(%rdi), %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: movdqa 256(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3] -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm8[0],xmm2[1] +; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] +; SSE-NEXT: movdqa 352(%rdi), %xmm5 +; SSE-NEXT: movdqa 288(%rdi), %xmm15 +; SSE-NEXT: movdqa 304(%rdi), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 256(%rdi), %xmm2 +; SSE-NEXT: movdqa 192(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 208(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa %xmm4, %xmm14 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,3,3] +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1] @@ -2145,13 +2145,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 64(%rdi), %ymm13 ; AVX2-NEXT: vmovaps {{.*#+}} xmm6 = [0,6,4,u] ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] @@ -2160,6 +2155,11 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7] +; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vmovaps %ymm0, %ymm7 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2322,13 +2322,8 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm5 -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm13 ; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm6 = [0,6,4,u] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] @@ -2337,6 +2332,11 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 +; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm3 +; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FP-NEXT: vmovaps %ymm0, %ymm7 ; AVX2-FP-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2502,18 +2502,17 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm13 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4 -; AVX2-FCP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm15 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [0,6,4,u] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5 +; AVX2-FCP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm15 +; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [0,6,4,u] ; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm7 -; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm5 -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[0,1],ymm1[0,1] -; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm15[0,1],ymm5[0,1] +; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3,4,5,6,7] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm0[4,5,6,7] @@ -3397,23 +3396,25 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 496(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; SSE-NEXT: movdqa 96(%rdi), %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 480(%rdi), %xmm8 -; SSE-NEXT: movdqa 112(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,0,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] +; SSE-NEXT: movdqa 480(%rdi), %xmm8 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm4 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 
b268c4a984cc1..a69825205cc04 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -2019,26 +2019,20 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE: # %bb.0: ; SSE-NEXT: subq $440, %rsp # imm = 0x1B8 ; SSE-NEXT: movdqa 240(%rdi), %xmm6 -; SSE-NEXT: movdqa 272(%rdi), %xmm5 -; SSE-NEXT: movdqa 224(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm7 ; SSE-NEXT: movdqa (%rdi), %xmm2 ; SSE-NEXT: movdqa 16(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa 48(%rdi), %xmm9 ; SSE-NEXT: movdqa 192(%rdi), %xmm14 +; SSE-NEXT: movdqa 160(%rdi), %xmm11 ; SSE-NEXT: movdqa 112(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 -; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,3,3] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2046,12 +2040,18 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] -; SSE-NEXT: movdqa %xmm9, %xmm11 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] +; SSE-NEXT: movdqa 304(%rdi), %xmm3 +; SSE-NEXT: movdqa 272(%rdi), %xmm5 +; SSE-NEXT: movdqa 224(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3] @@ -4211,41 +4211,41 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 80(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 640(%rdi), %xmm3 -; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 608(%rdi), %xmm4 -; SSE-NEXT: movdqa 576(%rdi), %xmm1 ; SSE-NEXT: movdqa 560(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm1 +; SSE-NEXT: movdqa 192(%rdi), %xmm7 ; SSE-NEXT: movdqa 160(%rdi), %xmm9 ; SSE-NEXT: movdqa 112(%rdi), %xmm2 ; 
SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 192(%rdi), %xmm7 -; SSE-NEXT: movaps 128(%rdi), %xmm0 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] +; SSE-NEXT: movdqa 128(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: movdqa %xmm7, %xmm14 -; SSE-NEXT: movdqa (%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm9, %xmm12 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3] ; SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,3,3] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 48(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm14 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1] ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE-NEXT: movdqa %xmm13, %xmm2 @@ -8552,22 +8552,22 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movdqa 608(%rdi), %xmm6 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 576(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa 160(%rdi), %xmm15 +; SSE-NEXT: movdqa 112(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] -; SSE-NEXT: movdqa 560(%rdi), %xmm10 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,3,3] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] +; SSE-NEXT: movdqa 560(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill +; SSE-NEXT: movdqa 576(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3] ; SSE-NEXT: movdqa %xmm10, %xmm1 @@ -11205,20 +11205,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovdqa 768(%rdi), %ymm12 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-NEXT: vmovdqa 672(%rdi), %ymm7 +; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX2-NEXT: vmovdqa 672(%rdi), %ymm7 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] +; AVX2-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -12235,20 +12235,20 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm12 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm11 -; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm7 +; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] +; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm11 +; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -13263,21 +13263,19 @@ 
define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm8 -; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] -; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm6 -; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 +; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm7 +; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,7,6,0] ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10 -; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-FCP-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -13289,9 +13287,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm13 +; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FCP-NEXT: vpbroadcastq 752(%rdi), %ymm2 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index 9448acd134008..68e68f62cfd00 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -2429,12 +2429,12 @@ define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] +; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] ; AVX-NEXT: vmovaps 480(%rdi), %ymm9 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = 
ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] +; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] @@ -4402,38 +4402,38 @@ define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 608(%rdi), %xmm6 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 672(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 640(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 736(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 704(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 576(%rdi), %xmm7 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movaps %xmm4, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 576(%rdi), %xmm7 +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm7, %xmm0 @@ -9222,38 +9222,38 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; SSE-NEXT: movaps 352(%rdi), %xmm5 ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill ; SSE-NEXT: movaps 416(%rdi), %xmm7 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 384(%rdi), %xmm8 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; SSE-NEXT: movaps 480(%rdi), %xmm9 -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 448(%rdi), %xmm3 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 160(%rdi), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 224(%rdi), %xmm2 -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 192(%rdi), %xmm0 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdi), %xmm1 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movaps 128(%rdi), %xmm1 +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm1, %xmm2 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1] ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 320(%rdi), %xmm6 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] ; SSE-NEXT: movaps %xmm8, %xmm3 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm0 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 320(%rdi), %xmm6 +; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm6, %xmm0 @@ -15802,49 +15802,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512-NEXT: 
vmovdqa64 576(%rdi), %zmm17 +; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512-NEXT: movb $-64, %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm30 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 @@ -15853,21 +15847,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16399,49 +16398,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512-FCP-NEXT: 
vmovdqa64 1472(%rdi), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512-FCP-NEXT: movb $-64, %al ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 @@ -16450,21 +16443,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512-FCP-NEXT: 
vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -16996,49 +16994,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512DQ-NEXT: movb $-64, %al ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm30 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 @@ -17047,21 +17039,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17593,49 +17590,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-FCP-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: movb $-64, %al ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 @@ -17644,21 +17635,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18190,49 +18186,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512BW-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512BW-NEXT: movb $-64, %al ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 ; 
AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 @@ -18241,21 +18231,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18787,49 +18782,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 
1408(%rdi), %zmm10 +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: movb $-64, %al ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 @@ -18838,21 +18827,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; 
AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19384,49 +19378,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: movb $-64, %al ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 @@ -19435,21 +19423,26 
@@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19981,49 +19974,43 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 960(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12 @@ -20032,21 +20019,26 @@ define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll index 9ed29fc54dbc1..ae144d9e4f498 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll @@ -1418,22 +1418,22 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 416(%rdi), %xmm7 ; AVX2-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11 ; AVX2-NEXT: vmovaps 384(%rdi), %xmm7 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12 -; AVX2-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps (%rdi), %xmm8 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13 ; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm15 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovaps 288(%rdi), %xmm14 -; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 256(%rdi), %xmm10 ; AVX2-NEXT: vinsertf128 $1, 320(%rdi), %ymm10, %ymm0 +; AVX2-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 288(%rdi), %xmm14 +; AVX2-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -1443,8 +1443,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] @@ -1536,22 +1535,22 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm7 ; AVX2-FP-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm7 -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12 -; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm8 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-FP-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13 ; 
AVX2-FP-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm15 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm14 -; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm10 ; AVX2-FP-NEXT: vinsertf128 $1, 320(%rdi), %ymm10, %ymm0 +; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm14 +; AVX2-FP-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -1561,8 +1560,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FP-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] @@ -1654,22 +1652,22 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm7 ; AVX2-FCP-NEXT: vinsertf128 $1, 480(%rdi), %ymm7, %ymm11 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm7 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vinsertf128 $1, 448(%rdi), %ymm7, %ymm12 -; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] +; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm8 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-FCP-NEXT: vinsertf128 $1, 96(%rdi), %ymm13, %ymm13 ; AVX2-FCP-NEXT: vinsertf128 $1, 64(%rdi), %ymm8, %ymm15 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm14 -; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm9[1],ymm10[3],ymm9[3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm10 ; AVX2-FCP-NEXT: vinsertf128 $1, 320(%rdi), %ymm10, %ymm0 +; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm14 +; AVX2-FCP-NEXT: vinsertf128 $1, 352(%rdi), %ymm14, %ymm1 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] @@ -1679,8 +1677,7 @@ define void @load_i64_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FCP-NEXT: vmovaps %ymm5, %ymm10 -; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm14[0],ymm5[2],ymm14[2] +; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm14[0],ymm10[2],ymm14[2] ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm13[0],ymm6[2],ymm13[2] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll index 1ef07aabc54c9..70b046ac4d1fe 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll @@ -798,18 +798,18 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-LABEL: load_i64_stride5_vf8: ; AVX: # %bb.0: ; AVX-NEXT: vmovapd 128(%rdi), %ymm1 -; AVX-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX-NEXT: vmovapd 224(%rdi), %ymm9 -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm2[6,7] +; AVX-NEXT: vmovapd 96(%rdi), %ymm2 ; AVX-NEXT: vmovapd 64(%rdi), %ymm7 -; AVX-NEXT: vmovaps (%rdi), %xmm10 -; AVX-NEXT: vmovaps 32(%rdi), %xmm4 +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm7[0,1,2],ymm2[3] +; AVX-NEXT: vmovapd (%rdi), %xmm10 +; AVX-NEXT: vmovapd 32(%rdi), %xmm4 +; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm10[0],xmm4[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3] +; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovapd 256(%rdi), %ymm0 +; AVX-NEXT: vmovapd 224(%rdi), %ymm9 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm8 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm11 -; AVX-NEXT: vmovapd 256(%rdi), %ymm0 -; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm4[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3] ; AVX-NEXT: vmovapd %ymm0, %ymm3 ; AVX-NEXT: vmovapd 192(%rdi), %xmm5 @@ -849,9 +849,9 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm15[2,3] -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3] +; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm4[0],xmm15[1] +; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3] ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm12[3] ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm5[0],xmm0[1] ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3] @@ -865,7 +865,7 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX-NEXT: vmovapd %ymm11, 32(%r8) ; AVX-NEXT: vmovapd %ymm8, (%r8) ; AVX-NEXT: vmovapd %ymm0, 32(%r9) -; AVX-NEXT: vmovaps %ymm1, (%r9) +; AVX-NEXT: vmovapd %ymm1, (%r9) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -10965,11 +10965,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; 
AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 @@ -11433,11 +11433,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 @@ -11901,11 +11901,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 @@ -12369,11 +12369,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 @@ -12837,11 +12837,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 @@ -13305,11 +13305,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 
%zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 @@ -13773,11 +13773,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 @@ -14241,11 +14241,11 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm12 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll index 3dbd078504caa..ee5a98e0e59c1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll @@ -922,18 +922,18 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps 240(%rdi), %xmm3 ; SSE-NEXT: movaps 192(%rdi), %xmm12 ; SSE-NEXT: movaps 336(%rdi), %xmm4 -; SSE-NEXT: movaps %xmm9, %xmm14 ; SSE-NEXT: movaps 288(%rdi), %xmm9 +; SSE-NEXT: movaps %xmm9, %xmm14 ; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm4[0] -; SSE-NEXT: movaps 96(%rdi), %xmm11 -; SSE-NEXT: movaps %xmm14, (%rsp) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm4[1] ; SSE-NEXT: movaps %xmm12, %xmm4 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm3[1] +; SSE-NEXT: movaps 96(%rdi), %xmm3 +; SSE-NEXT: movaps %xmm14, (%rsp) # 16-byte Spill +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm11, %xmm3 +; SSE-NEXT: movaps %xmm3, 
%xmm11 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm2[0] ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2503,8 +2503,6 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2: # %bb.0: ; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 @@ -2513,10 +2511,8 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-NEXT: vmovaps 320(%rdi), %ymm10 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-NEXT: vmovaps %ymm0, %ymm15 -; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2524,9 +2520,13 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-NEXT: vmovaps 384(%rdi), %xmm6 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] -; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX2-NEXT: vmovaps 288(%rdi), %ymm12 +; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovaps 240(%rdi), %xmm11 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm9 @@ -2717,8 +2717,6 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 @@ -2727,10 +2725,8 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm10 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-FP-NEXT: vmovaps %ymm0, %ymm15 -; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2738,9 +2734,13 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm6 ; AVX2-FP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm4 = 
ymm4[0],ymm7[0],ymm4[2],ymm7[2] -; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm12 +; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovaps 240(%rdi), %xmm11 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm9 @@ -2931,8 +2931,6 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $488, %rsp # imm = 0x1E8 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm7 -; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm12 -; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm0 @@ -2941,10 +2939,8 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm1[0],xmm5[0] -; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm10 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[2],ymm0[2] ; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm15 -; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2952,9 +2948,13 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm6 ; AVX2-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm6[0],xmm3[0] ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[2],ymm7[2] -; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm10 +; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm12 +; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovaps 240(%rdi), %xmm11 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm9 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll index 16647d0da63c5..da66e846d4c95 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll @@ -9707,13 +9707,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: subq $2760, %rsp # imm = 0xAC8 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512BW-NEXT: vmovdqu64 %zmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm28 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12 @@ -9726,6 +9730,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6 @@ -9737,17 +9742,18 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0] ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 +; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3 @@ -9793,11 +9799,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm16 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm29 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10180,13 +10181,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: 
subq $2760, %rsp # imm = 0xAC8 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 @@ -10199,6 +10204,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 @@ -10210,17 +10216,18 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0] ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 @@ -10266,11 +10273,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 -; AVX512BW-FCP-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm29 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10653,13 +10655,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: subq $2760, %rsp # imm = 0xAC8 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm12 @@ -10672,6 +10678,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6 @@ -10683,17 +10690,18 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0] ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] ; AVX512DQ-BW-NEXT: # 
zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3 @@ -10739,11 +10747,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm16 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm18, %zmm29 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11126,13 +11129,17 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] +; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12 @@ -11145,6 +11152,7 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6 @@ -11156,17 +11164,18 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6] -; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [9,0,7,0,9,0,7,0] ; AVX512DQ-BW-FCP-NEXT: # 
zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3 @@ -11212,11 +11221,6 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm18, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -17264,38 +17268,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm7 -; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm12 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm20 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm14 -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm10 -; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[14,0,0,7,14,0,0,7] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] @@ -17303,10 +17286,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 ; AVX512-NEXT: vmovdqa 464(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -17315,10 +17298,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1 ; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm12 +; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm18 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm7 +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm17 +; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa 912(%rdi), %xmm2 @@ -17351,9 +17355,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 ; AVX512-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ 
-18263,38 +18266,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512-FCP-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10 -; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] @@ -18302,10 +18284,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -18314,10 +18296,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 +; AVX512-FCP-NEXT: 
vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7 +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17 +; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 @@ -18350,9 +18353,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19262,38 +19264,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512DQ-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm7 -; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm12 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm20 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm14 -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm10 -; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] @@ -19301,10 +19282,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm2 -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -19313,10 +19294,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm18 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm7 +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm17 +; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa 912(%rdi), %xmm2 @@ -19349,9 +19351,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20261,38 +20262,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512DQ-FCP-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm2[0,1],mem[2,3] @@ -20300,10 +20280,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -20312,10 +20292,31 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm2 @@ -20348,9 +20349,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: 
vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21259,14 +21259,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm25 ; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm16 +; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm30 ; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 ; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21280,28 +21291,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17 -; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa 2704(%rdi), %xmm2 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512BW-NEXT: 
vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -21347,9 +21347,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 ; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 @@ -22241,14 +22240,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512BW-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22262,28 +22272,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17 -; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -22329,9 +22328,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 @@ -23223,14 +23221,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512DQ-BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm5 +; AVX512DQ-BW-NEXT: 
vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23244,28 +23253,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm17 -; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa 2704(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -23311,9 +23309,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %zmm2 @@ -24205,14 +24202,25 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] +; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24226,28 +24234,17 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 
%zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2 @@ -24293,9 +24290,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll index 80f628099ee89..a9ee7c6c74217 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll @@ -4047,10 +4047,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512-NEXT: vmovaps (%rdi), %zmm0 -; AVX512-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm31 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -4063,7 +4060,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm19 ; AVX512-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm18 @@ -4075,16 +4071,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -4098,20 +4093,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 512(%rdi), %ymm27 ; 
AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -4119,23 +4109,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 +; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -4270,10 +4269,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0 -; AVX512-FCP-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), 
%zmm8 ; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -4286,7 +4282,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 @@ -4298,16 +4293,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -4321,20 +4315,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -4342,23 +4331,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqa64 
192(%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 +; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -4493,10 +4491,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovaps (%rdi), %zmm0 -; AVX512DQ-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm31 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -4509,7 +4504,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm18 @@ -4521,16 +4515,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), 
%ymm25 -; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -4544,20 +4537,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -4565,23 +4553,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 +; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm23 +; 
AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -4716,10 +4713,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -4732,7 +4726,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 @@ -4744,16 +4737,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -4767,20 +4759,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512DQ-FCP-NEXT: # zmm21 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -4788,23 +4775,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 +; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -4939,10 +4935,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-NEXT: vmovaps (%rdi), %zmm0 -; AVX512BW-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm31 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -4955,7 +4948,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, 
%zmm20 -; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm18 @@ -4967,16 +4959,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -4990,20 +4981,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -5011,23 +4997,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, 
%zmm1 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 +; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -5162,10 +5157,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -5178,7 +5170,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 @@ -5190,16 +5181,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -5213,20 +5203,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -5234,23 +5219,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 +; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ 
-5385,10 +5379,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm0 -; AVX512DQ-BW-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -5401,7 +5392,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm18 @@ -5413,16 +5403,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -5436,20 +5425,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 
$0, %ymm1, %zmm13, %zmm20 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -5457,23 +5441,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 +; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -5608,10 +5601,7 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm10 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30 @@ -5624,7 +5614,6 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm29, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm18 @@ -5636,16 +5625,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; 
AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm21 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm19, %zmm21 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm23[0],ymm22[0],ymm23[2],ymm22[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm24 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm25 -; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm12 = ymm25[0],ymm24[0],ymm25[2],ymm24[2] +; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm12[2,3],ymm11[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm21, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11 @@ -5659,20 +5647,15 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm27 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm27[0],ymm26[0],ymm27[2],ymm26[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm13[2,3] +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm23[1],ymm22[1],ymm23[3],ymm22[3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] -; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm25[1],ymm24[1],ymm25[3],ymm24[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1 @@ -5680,23 +5663,32 @@ define void @load_i64_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm10, %zmm14, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm12[1],ymm11[1],ymm12[3],ymm11[3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm27[1],ymm26[1],ymm27[3],ymm26[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,4,12,4,12,4,12] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm5[0],zmm16[0],zmm5[2],zmm16[2],zmm5[4],zmm16[4],zmm5[6],zmm16[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [1,9,1,9,1,9,1,9] +; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload @@ -8745,23 +8737,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $-64, %al -; AVX512-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -8772,6 +8753,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm20 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 @@ -8786,6 +8772,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] ; 
AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 @@ -8853,10 +8845,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm13 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm15 ; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -8986,6 +8978,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -8997,7 +8990,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -9266,23 +9258,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $-64, %al -; AVX512-FCP-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -9293,6 +9274,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 @@ -9307,6 +9293,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 @@ -9374,10 +9366,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 ; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -9507,6 +9499,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -9518,7 +9511,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512-FCP-NEXT: 
vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -9787,23 +9779,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $-64, %al -; AVX512DQ-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -9814,6 +9795,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm20 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 @@ -9828,6 +9814,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 @@ -9895,10 +9887,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; 
AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm13 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm15 ; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10028,6 +10020,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -10039,7 +10032,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -10308,23 +10300,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $-64, %al -; AVX512DQ-FCP-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -10335,6 +10316,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 @@ -10349,6 +10335,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpcklqdq 
{{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 @@ -10416,10 +10408,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10549,6 +10541,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -10560,7 +10553,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -10829,23 +10821,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm0 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -10856,6 +10837,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm20 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 @@ -10870,6 +10856,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 @@ -10937,10 +10929,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm15 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11070,6 +11062,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -11081,7 +11074,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -11350,23 +11342,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: movb $-64, %al -; AVX512BW-FCP-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -11377,6 +11358,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 @@ -11391,6 +11377,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 @@ -11458,10 +11450,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] 
; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11591,6 +11583,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -11602,7 +11595,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -11871,23 +11863,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: movb $-64, %al -; AVX512DQ-BW-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm25 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -11898,6 +11879,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm31 +; 
AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm20 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 @@ -11912,6 +11898,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 @@ -11979,10 +11971,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm15 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12112,6 +12104,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -12123,7 +12116,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -12389,26 +12381,15 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: movb $-64, %al -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: movb $-64, %al ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,2,10,2,10,2,10] ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %ymm22 @@ -12419,6 +12400,11 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm13 = ymm6[0],ymm9[0],ymm6[2],ymm9[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm2, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm20 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm2 @@ -12433,6 +12419,12 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm21[0],ymm17[0],ymm21[2],ymm17[2] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm25 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm0, %zmm4 @@ -12500,10 +12492,10 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm28[1],ymm25[1],ymm28[3],ymm25[3] ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = 
ymm8[1],ymm2[1],ymm8[3],ymm2[3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm15 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -12633,6 +12625,7 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm2, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm2, %zmm14 @@ -12644,7 +12637,6 @@ define void @load_i64_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm30 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm11 = zmm11[1],zmm10[1],zmm11[3],zmm10[3],zmm11[5],zmm10[5],zmm11[7],zmm10[7] @@ -18961,31 +18953,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: movb $-64, %al -; AVX512-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512-NEXT: kmovw %eax, %k1 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -18995,11 +18972,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, 
%zmm1, %zmm0 +; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -19012,7 +18994,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1 @@ -20032,31 +20024,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: movb $-64, %al -; AVX512-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512-FCP-NEXT: kmovw %eax, %k1 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512-FCP-NEXT: 
vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -20066,11 +20043,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -20083,7 +20065,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 @@ -21103,31 +21095,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: movb $-64, %al -; AVX512DQ-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512DQ-NEXT: kmovw %eax, %k1 ; AVX512DQ-NEXT: 
vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512DQ-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -21137,11 +21114,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -21154,7 +21136,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1 @@ -22174,31 +22166,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: movb $-64, %al -; AVX512DQ-FCP-NEXT: vmovups %zmm13, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512DQ-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -22208,11 +22185,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} @@ -22225,7 +22207,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2] ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; 
AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1 @@ -23245,31 +23237,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: movb $-64, %al -; AVX512BW-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm6 -; AVX512BW-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15 -; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm13 -; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 ; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] @@ -23279,11 +23256,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2] ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm9 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm12 +; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 -; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 
@@ -23296,7 +23278,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15
+; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm13
+; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm0
 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1
@@ -24316,31 +24308,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: movb $-64, %al
-; AVX512BW-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6
-; AVX512BW-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
-; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
-; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
 ; AVX512BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
 ; AVX512BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0
 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -24350,11 +24327,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -24367,7 +24349,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
@@ -25387,31 +25379,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: movb $-64, %al
-; AVX512DQ-BW-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-BW-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm14
 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-NEXT: vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-BW-NEXT: vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -25421,11 +25398,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm5
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm6
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm9
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -25438,7 +25420,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm13
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1
@@ -26458,31 +26450,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al
-; AVX512DQ-BW-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovups %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [2,10,2,10,2,10,2,10]
 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
@@ -26492,11 +26469,16 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm6
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm2, %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
@@ -26509,7 +26491,17 @@ define void @load_i64_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm19[0],ymm22[0],ymm19[2],ymm22[2]
 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm3[2,3],ymm0[2,3]
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm0
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index 429758c835065..a03a0536c78cb 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -1038,7 +1038,6 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa 16(%rdi), %xmm4
 ; SSE-NEXT: movdqa 32(%rdi), %xmm8
 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm13, %xmm7
 ; SSE-NEXT: pandn %xmm4, %xmm7
 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
@@ -1052,8 +1051,8 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: por %xmm7, %xmm0
 ; SSE-NEXT: pxor %xmm9, %xmm9
 ; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0]
 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0]
 ; SSE-NEXT: movdqa %xmm6, %xmm3
 ; SSE-NEXT: pandn %xmm1, %xmm3
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
@@ -1068,16 +1067,13 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
 ; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; SSE-NEXT: pand %xmm7, %xmm0
 ; SSE-NEXT: movdqa %xmm8, %xmm3
 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535]
 ; SSE-NEXT: movdqa %xmm15, %xmm1
 ; SSE-NEXT: pandn %xmm3, %xmm1
-; SSE-NEXT: movdqa 48(%rdi), %xmm12
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
 ; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm15, %xmm10
 ; SSE-NEXT: por %xmm1, %xmm10
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,0]
@@ -1087,6 +1083,10 @@ define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: packuswb %xmm1, %xmm1
 ; SSE-NEXT: movdqa %xmm7, %xmm10
 ; SSE-NEXT: pandn %xmm1, %xmm10
+; SSE-NEXT: movdqa 48(%rdi), %xmm12
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm7, %xmm0
+; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: por %xmm0, %xmm10
 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm13, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index 60fcf25b507b7..1cd705b12a57e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -1125,8 +1125,9 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pand %xmm8, %xmm0
 ; SSE-NEXT: movdqa %xmm2, %xmm6
 ; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: movdqa %xmm2, %xmm15
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm8, %xmm1
 ; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm14, %xmm0
 ; SSE-NEXT: pand %xmm8, %xmm0
@@ -1134,7 +1135,6 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pand %xmm8, %xmm7
 ; SSE-NEXT: packuswb %xmm0, %xmm7
 ; SSE-NEXT: packuswb %xmm1, %xmm7
-; SSE-NEXT: movdqa %xmm2, %xmm15
 ; SSE-NEXT: movdqa %xmm12, %xmm0
 ; SSE-NEXT: pand %xmm8, %xmm0
 ; SSE-NEXT: movdqa %xmm11, %xmm1
@@ -1960,7 +1960,6 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE: # %bb.0:
 ; SSE-NEXT: subq $600, %rsp # imm = 0x258
 ; SSE-NEXT: movdqa 32(%rdi), %xmm15
-; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 48(%rdi), %xmm14
 ; SSE-NEXT: movdqa 128(%rdi), %xmm4
 ; SSE-NEXT: movdqa 144(%rdi), %xmm7
@@ -1977,8 +1976,8 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm2, %xmm5
 ; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm13, %xmm0
-; SSE-NEXT: packuswb %xmm13, %xmm1
 ; SSE-NEXT: pand %xmm9, %xmm0
 ; SSE-NEXT: movdqa %xmm6, %xmm2
 ; SSE-NEXT: pand %xmm9, %xmm2
@@ -1988,21 +1987,22 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm12, %xmm0
 ; SSE-NEXT: pand %xmm9, %xmm0
 ; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: pand %xmm9, %xmm1
+; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: movdqa 16(%rdi), %xmm8
 ; SSE-NEXT: pand %xmm9, %xmm0
 ; SSE-NEXT: movdqa %xmm4, %xmm2
 ; SSE-NEXT: pand %xmm9, %xmm2
 ; SSE-NEXT: packuswb %xmm0, %xmm2
 ; SSE-NEXT: packuswb %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm14, %xmm0
 ; SSE-NEXT: pand %xmm9, %xmm0
 ; SSE-NEXT: movdqa %xmm15, %xmm1
 ; SSE-NEXT: pand %xmm9, %xmm1
 ; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm8
+; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm8, %xmm0
 ; SSE-NEXT: pand %xmm9, %xmm0
 ; SSE-NEXT: movdqa (%rdi), %xmm11
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index c215e2dd9f4d9..c5f1742538c12 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -2126,10 +2126,10 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm4, %xmm0
-; SSE-NEXT: movdqa 48(%rdi), %xmm2
 ; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: movdqa 48(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm4, %xmm1
 ; SSE-NEXT: por %xmm0, %xmm1
@@ -4084,12 +4084,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa 176(%rdi), %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 192(%rdi), %xmm1
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
 ; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: movdqa 208(%rdi), %xmm4
 ; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: movdqa 208(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm4, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm2, %xmm14
@@ -4791,10 +4791,10 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pand %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
 ; SSE-NEXT: pandn %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm8, %xmm15
 ; SSE-NEXT: pand %xmm8, %xmm13
 ; SSE-NEXT: pand %xmm8, %xmm5
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index c7b73198c7f4d..a696b0b290688 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -2587,8 +2587,10 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm10, %xmm0
 ; SSE-NEXT: pandn %xmm1, %xmm0
 ; SSE-NEXT: movdqa %xmm10, %xmm4
-; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: pandn %xmm14, %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa %xmm11, %xmm1
 ; SSE-NEXT: pandn %xmm5, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm10, %xmm1
@@ -2674,8 +2676,6 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa 160(%rdi), %xmm5
 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm10, %xmm5
-; SSE-NEXT: pandn %xmm14, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm10, %xmm12
 ; SSE-NEXT: movdqa %xmm11, %xmm4
 ; SSE-NEXT: pandn %xmm9, %xmm4
@@ -4662,9 +4662,6 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa 80(%rdi), %xmm5
 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 16(%rdi), %xmm6
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535]
@@ -4673,11 +4670,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0]
 ; SSE-NEXT: movdqa %xmm3, %xmm2
 ; SSE-NEXT: pandn %xmm0, %xmm2
-; SSE-NEXT: movdqa (%rdi), %xmm7
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm13, %xmm2
 ; SSE-NEXT: pandn %xmm0, %xmm2
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa (%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 16(%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm13, %xmm0
 ; SSE-NEXT: por %xmm1, %xmm0
 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4881,7 +4881,6 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm3, %xmm4
 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
-; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm7, %xmm2
 ; SSE-NEXT: movdqa %xmm7, %xmm8
 ; SSE-NEXT: pandn %xmm6, %xmm8
@@ -4892,6 +4891,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
 ; SSE-NEXT: pandn %xmm13, %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm2, %xmm3
 ; SSE-NEXT: movdqa %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 48ef75742eccb..d38fc38cb104f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -1087,6 +1087,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm4, %xmm2
 ; SSE-NEXT: movdqa %xmm4, %xmm3
 ; SSE-NEXT: pand %xmm0, %xmm3
+; SSE-NEXT: movdqa %xmm10, %xmm11
 ; SSE-NEXT: pandn %xmm10, %xmm0
 ; SSE-NEXT: por %xmm3, %xmm0
 ; SSE-NEXT: movdqa %xmm0, %xmm3
@@ -1096,7 +1097,6 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm10, %xmm11
 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5]
@@ -1894,10 +1894,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm3, %xmm9
 ; SSE-NEXT: movdqa %xmm6, %xmm3
 ; SSE-NEXT: movdqa %xmm6, %xmm11
-; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm10, %xmm3
 ; SSE-NEXT: por %xmm0, %xmm3
-; SSE-NEXT: movdqa 96(%rdi), %xmm15
 ; SSE-NEXT: movdqa %xmm3, %xmm0
 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535]
@@ -1917,11 +1915,8 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535]
 ; SSE-NEXT: movdqa %xmm3, %xmm1
 ; SSE-NEXT: pandn %xmm7, %xmm1
-; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm5
 ; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm3, %xmm13
 ; SSE-NEXT: por %xmm1, %xmm2
 ; SSE-NEXT: movdqa %xmm2, %xmm1
 ; SSE-NEXT: pxor %xmm6, %xmm6
@@ -1931,8 +1926,12 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE-NEXT: movdqa %xmm15, %xmm2
-; SSE-NEXT: movdqa %xmm15, %xmm3
+; SSE-NEXT: movdqa 96(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm3, %xmm13
+; SSE-NEXT: movdqa %xmm2, %xmm3
 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
@@ -3620,16 +3619,14 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: subq $648, %rsp # imm = 0x288
 ; SSE-NEXT: movdqa 208(%rdi), %xmm14
 ; SSE-NEXT: movdqa 112(%rdi), %xmm4
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 176(%rdi), %xmm6
 ; SSE-NEXT: movdqa 128(%rdi), %xmm3
 ; SSE-NEXT: movdqa 144(%rdi), %xmm1
-; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
 ; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: movdqa 160(%rdi), %xmm7
 ; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: movdqa 160(%rdi), %xmm7
+; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm7, %xmm1
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm2, %xmm9
@@ -3646,7 +3643,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: packuswb %xmm0, %xmm2
 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
-; SSE-NEXT: movdqa 192(%rdi), %xmm5
 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535]
 ; SSE-NEXT: movdqa %xmm7, %xmm1
 ; SSE-NEXT: pandn %xmm3, %xmm1
@@ -3665,7 +3661,6 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pandn %xmm1, %xmm4
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
 ; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: movdqa %xmm7, %xmm15
 ; SSE-NEXT: por %xmm4, %xmm3
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
@@ -3676,6 +3671,10 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm0, %xmm3
 ; SSE-NEXT: pandn %xmm2, %xmm3
 ; SSE-NEXT: por %xmm3, %xmm1
+; SSE-NEXT: movdqa 192(%rdi), %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 176(%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm7, %xmm15
 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535]
 ; SSE-NEXT: movdqa %xmm7, %xmm2
 ; SSE-NEXT: pandn %xmm6, %xmm2
@@ -3942,11 +3941,11 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535]
 ; SSE-NEXT: movdqa %xmm2, %xmm0
 ; SSE-NEXT: pandn %xmm5, %xmm0
+; SSE-NEXT: movdqa %xmm12, %xmm7
 ; SSE-NEXT: movdqa %xmm12, %xmm5
-; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pandn %xmm1, %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3]
-; SSE-NEXT: movdqa %xmm12, %xmm7
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: por %xmm0, %xmm1
@@ -7221,35 +7220,35 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa 176(%rdi), %xmm8
 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 128(%rdi), %xmm3
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 144(%rdi), %xmm1
-; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
 ; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: movdqa 160(%rdi), %xmm6
 ; SSE-NEXT: pandn %xmm1, %xmm0
+; SSE-NEXT: movdqa 160(%rdi), %xmm6
+; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm6, %xmm1
-; SSE-NEXT: pxor %xmm6, %xmm6
 ; SSE-NEXT: pand %xmm2, %xmm1
 ; SSE-NEXT: movdqa %xmm2, %xmm7
 ; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: pxor %xmm6, %xmm6
 ; SSE-NEXT: movdqa %xmm1, %xmm0
 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: movdqa 112(%rdi), %xmm4
 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
+; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535]
+; SSE-NEXT: movdqa %xmm11, %xmm1
+; SSE-NEXT: pandn %xmm3, %xmm1
+; SSE-NEXT: movdqa 112(%rdi), %xmm4
+; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
 ; SSE-NEXT: packuswb %xmm0, %xmm2
 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
-; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535]
-; SSE-NEXT: movdqa %xmm11, %xmm1
-; SSE-NEXT: pandn %xmm3, %xmm1
 ; SSE-NEXT: movdqa %xmm4, %xmm3
 ; SSE-NEXT: pand %xmm11, %xmm3
 ; SSE-NEXT: por %xmm1, %xmm3
@@ -7855,13 +7854,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm15, %xmm0
 ; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm9, %xmm4
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm1
 ; SSE-NEXT: movdqa %xmm12, %xmm2
+; SSE-NEXT: movdqa %xmm12, %xmm1
+; SSE-NEXT: movdqa %xmm9, %xmm4
 ; SSE-NEXT: pandn %xmm9, %xmm1
 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: pand %xmm15, %xmm4
 ; SSE-NEXT: por %xmm0, %xmm4
@@ -9479,12 +9478,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128]
 ; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm14
 ; AVX-NEXT: vpor %xmm5, %xmm14, %xmm5
+; AVX-NEXT: vmovdqa %xmm12, %xmm14
 ; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm5, %xmm2
+; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm5
 ; AVX-NEXT: vmovdqa %xmm13, %xmm9
 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm2
-; AVX-NEXT: vmovdqa %xmm12, %xmm14
-; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpshufb %xmm13, %xmm13, %xmm5
 ; AVX-NEXT: vmovdqa %xmm1, %xmm12
 ; AVX-NEXT: vpor %xmm5, %xmm2, %xmm1
 ; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2
@@ -9936,8 +9935,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX-NEXT: # xmm9 = mem[0,0]
 ; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm10
 ; AVX-NEXT: vpor %xmm7, %xmm10, %xmm7
-; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm10
 ; AVX-NEXT: vmovdqa %xmm0, %xmm11
+; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm10
 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7
 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
 ; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3
@@ -10148,22 +10147,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: subq $760, %rsp # imm = 0x2F8
 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6
-; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8
 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
 ; AVX2-NEXT: vmovdqa %ymm3, %ymm13
 ; AVX2-NEXT: vmovdqa %ymm2, %ymm10
-; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vmovdqa %ymm1, %ymm12
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
 ; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7
 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
@@ -10171,9 +10166,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3
 ; AVX2-NEXT: vmovdqa %ymm9, %ymm14
 ; AVX2-NEXT: vmovdqa %ymm5, %ymm9
-; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vmovdqa %ymm4, %ymm11
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
@@ -10182,6 +10175,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5
 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-NEXT: vmovdqa %ymm7, %ymm0
@@ -10687,22 +10686,18 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP: # %bb.0:
 ; AVX2-FP-NEXT: subq $760, %rsp # imm = 0x2F8
 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm6
-; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm8
 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2
 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm13
 ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm12
 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm3
-; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm7
 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0
@@ -10710,9 +10705,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3
 ; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm14
 ; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm9
-; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm11
-; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
@@ -10721,6 +10714,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm7
+; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5
 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm0
@@ -11226,30 +11225,23 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP: # %bb.0:
 ; AVX2-FCP-NEXT: subq $776, %rsp # imm = 0x308
 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6
-; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10
 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5
 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm0
 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm7
-; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm13
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm3
-; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm15
 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0
 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm3
 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm9
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm11
-; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
@@ -11258,6 +11250,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6
+; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10
+; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm5
 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm7
@@ -11267,9 +11266,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm6
 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm2
 ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8
 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
@@ -13521,6 +13519,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512BW-NEXT: vporq %xmm22, %xmm20, %xmm20
 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4}
 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23
 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -13573,7 +13572,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k3}
 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -13894,6 +13892,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20
 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22
+; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -13913,7 +13912,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15
@@ -14138,13 +14136,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
 ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0
 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7
+; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1}
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
 ; AVX512BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000
-; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
@@ -14254,6 +14252,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm20, %xmm20
 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQ-BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4}
 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -14306,7 +14305,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3}
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -14624,6 +14622,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20
 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22
+; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
@@ -14643,7 +14642,6 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15
@@ -14868,13 +14866,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2}
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1}
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
 ; AVX512DQ-BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000
-; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 72be7f0399fd5..75d0538740385 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -3793,10 +3793,6 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE: # %bb.0:
 ; SSE-NEXT: subq $904, %rsp # imm = 0x388
 ; SSE-NEXT: movdqa 128(%rdi), %xmm6
-; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa 80(%rdi), %xmm10
-; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 144(%rdi), %xmm13
 ; SSE-NEXT: movdqa 160(%rdi), %xmm11
 ; SSE-NEXT: movdqa 176(%rdi), %xmm14
@@ -3810,8 +3806,8 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pand %xmm4, %xmm0
 ; SSE-NEXT: movdqa %xmm8, %xmm1
 ; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: packuswb %xmm0, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
 ; SSE-NEXT: movdqa %xmm7, %xmm1
 ; SSE-NEXT: pand %xmm4, %xmm1
 ; SSE-NEXT: pand %xmm4, %xmm2
@@ -3822,8 +3818,6 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pand %xmm4, %xmm0
 ; SSE-NEXT: movdqa %xmm11, %xmm1
 ; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: movdqa 96(%rdi), %xmm12
-; SSE-NEXT: movdqa 64(%rdi), %xmm5
 ; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: movdqa %xmm13, %xmm0
 ; SSE-NEXT: pand %xmm4, %xmm0
@@ -3831,11 +3825,17 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pand %xmm4, %xmm3
 ; SSE-NEXT: packuswb %xmm0, %xmm3
 ; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: movdqa 112(%rdi), %xmm15
 ; SSE-NEXT: packuswb %xmm0, %xmm3
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
+; SSE-NEXT: movdqa 64(%rdi), %xmm5
+; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 80(%rdi), %xmm10
+; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 96(%rdi), %xmm12
+; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa 112(%rdi), %xmm15
+; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm15, %xmm0
 ; SSE-NEXT: pand %xmm4, %xmm0
@@ -8620,16 +8620,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: movdqa 192(%rdi), %xmm3
 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 224(%rdi), %xmm9
-; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa 240(%rdi), %xmm12
 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0]
 ; SSE-NEXT: movdqa %xmm12, %xmm0
 ; SSE-NEXT: pand %xmm4, %xmm0
 ; SSE-NEXT: movdqa %xmm9, %xmm1
 ; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: movdqa 208(%rdi), %xmm2
 ; SSE-NEXT: packuswb %xmm0, %xmm1
 ; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: movdqa 208(%rdi), %xmm2
+; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm2, %xmm1
 ; SSE-NEXT: pand %xmm4, %xmm1
 ; SSE-NEXT: movdqa %xmm3, %xmm2
@@ -8648,10 +8648,10 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; SSE-NEXT: pand %xmm4, %xmm3
 ; SSE-NEXT: packuswb %xmm0, %xmm3
 ; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: movdqa 112(%rdi), %xmm14
 ; SSE-NEXT: packuswb %xmm0, %xmm3
+; SSE-NEXT: movdqa 112(%rdi), %xmm14
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm2[0,3]
 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movdqa %xmm14, %xmm0
@@ -14311,7 +14311,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm6
 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm11
 ; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm15
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
@@ -14324,6 +14323,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
 ; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm4
 ; AVX512-NEXT: vmovdqa64 %xmm13, %xmm30
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm6
 ; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm9
 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
 ; AVX512-NEXT: vmovdqa64 %xmm26, %xmm13
@@ -14837,6 +14837,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4
+; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm16
 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm6
 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -14854,8 +14855,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm28
 ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2
 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm2
 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
 ; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm4
 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
@@ -14866,10 +14867,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 -; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm20 @@ -15626,7 +15626,6 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm9 -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm11 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm15 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3] @@ -15639,6 +15638,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1} ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm4 ; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm30 +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm9 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm13 @@ -16152,6 +16152,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm6 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -16169,8 +16170,8 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm4 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] @@ -16181,10 +16182,9 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = 
[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm16 -; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm10 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm20 @@ -16459,34 +16459,34 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512BW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 ; AVX512BW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vmovdqa 448(%rdi), %xmm7 ; AVX512BW-NEXT: vpshufb %xmm19, %xmm7, %xmm6 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512BW-NEXT: vmovdqa64 %xmm7, %xmm30 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX512BW-NEXT: vpmovqb %ymm4, %xmm4 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512BW-NEXT: vmovdqa 368(%rdi), %xmm2 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX512BW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512BW-NEXT: vmovaps 368(%rdi), %xmm2 -; AVX512BW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-NEXT: vmovdqa64 352(%rdi), %xmm27 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512BW-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX512BW-NEXT: vpshufb %xmm19, %xmm2, %xmm11 ; AVX512BW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512BW-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX512BW-NEXT: vpshufb %xmm19, %xmm2, %xmm15 -; AVX512BW-NEXT: vmovdqa %xmm2, %xmm9 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512BW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-NEXT: vmovdqa64 352(%rdi), %xmm27 +; AVX512BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 +; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512BW-NEXT: vmovdqa %xmm2, %xmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] ; AVX512BW-NEXT: vpmovqb %zmm1, %xmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 @@ -17324,19 +17324,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; 
AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 -; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 ; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11 +; AVX512BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} @@ -17586,34 +17586,34 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8] ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm6, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm6, %xmm26 -; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %xmm7 ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm7, %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm7, %xmm30 +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm4 ; AVX512DQ-BW-NEXT: vpmovqb %ymm4, %xmm4 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7] ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX512DQ-BW-NEXT: vmovdqa 368(%rdi), %xmm2 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm2, %xmm31 -; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 -; AVX512DQ-BW-NEXT: vmovaps 368(%rdi), %xmm2 -; AVX512DQ-BW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %xmm27 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 -; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] ; AVX512DQ-BW-NEXT: vmovdqa 336(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm2, %xmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %xmm2, %xmm22 -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %xmm2 ; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm2, %xmm15 -; AVX512DQ-BW-NEXT: vmovdqa %xmm2, %xmm9 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3] +; 
AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %xmm27 +; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6 +; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512DQ-BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa %xmm2, %xmm9 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] ; AVX512DQ-BW-NEXT: vpmovqb %zmm1, %xmm11 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 @@ -18451,19 +18451,19 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm5, %xmm19 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11] ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm28, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %xmm1, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm18, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm11 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll index 9fd7862fdc368..e7a6bfc48760e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll @@ -1163,13 +1163,9 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i16_stride3_vf32: ; SSE: # %bb.0: ; SSE-NEXT: movdqa 16(%rdi), %xmm6 -; SSE-NEXT: movdqa 32(%rdi), %xmm4 ; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rsi), %xmm7 -; SSE-NEXT: movdqa 32(%rsi), %xmm8 ; SSE-NEXT: movdqa 48(%rsi), %xmm11 ; SSE-NEXT: movdqa 48(%rdx), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] ; SSE-NEXT: movdqa %xmm0, %xmm9 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1183,11 +1179,13 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,2,2] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: 
movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa 32(%rdx), %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 32(%rdi), %xmm0 +; SSE-NEXT: movdqa 16(%rsi), %xmm7 +; SSE-NEXT: movdqa 32(%rsi), %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0] ; SSE-NEXT: movdqa %xmm5, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 @@ -1198,19 +1196,19 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,7,5,4,5] ; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: por %xmm3, %xmm9 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,2,2] -; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: movdqa 32(%rdx), %xmm3 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2] +; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pand %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,2,2] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,2] ; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm1, %xmm10 ; SSE-NEXT: por %xmm4, %xmm10 @@ -2829,7 +2827,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa 80(%rsi), %xmm5 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm7, %xmm7 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] @@ -2850,6 +2847,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm13, %ymm7 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 +; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX2-FP-NEXT: vmovdqa 48(%rsi), %xmm8 @@ -2860,8 +2858,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm15 +; AVX2-FP-NEXT: vpermd %ymm5, %ymm10, %ymm15 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm15 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm0 @@ -2973,7 +2970,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 80(%rsi), %xmm5 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] -; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 ; AVX2-FCP-NEXT: vpshufb %xmm9, 
%xmm7, %xmm7 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] @@ -2994,6 +2990,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm7 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm7, %ymm3 +; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 ; AVX2-FCP-NEXT: vmovdqa 48(%rsi), %xmm8 @@ -3004,8 +3001,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm8 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm15 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm7, %ymm7 -; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm15 +; AVX2-FCP-NEXT: vpermd %ymm5, %ymm10, %ymm15 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm7, %ymm15, %ymm7 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm15 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm0 @@ -3108,17 +3104,15 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa (%rsi), %xmm5 ; AVX512-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] ; AVX512-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm11 @@ -3136,8 +3130,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm10 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512-NEXT: vmovdqa 80(%rdi), %xmm12 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512-NEXT: vmovdqa 80(%rdi), %xmm12 ; AVX512-NEXT: vmovdqa 80(%rsi), %xmm13 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] @@ -3152,7 +3146,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512-NEXT: vmovdqa64 32(%rsi), %xmm24 ; AVX512-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm5 @@ -3163,12 +3156,10 @@ 
define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa 112(%rsi), %xmm12 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] ; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512-NEXT: vmovdqa64 16(%rsi), %xmm20 ; AVX512-NEXT: vprold $16, %xmm12, %xmm12 -; AVX512-NEXT: vmovdqa64 16(%rdi), %xmm21 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] +; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] ; AVX512-NEXT: vmovdqa 96(%rdx), %ymm5 @@ -3200,6 +3191,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpternlogq $248, %zmm15, %zmm0, %zmm5 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm6 ; AVX512-NEXT: vpshufb %ymm13, %ymm6, %ymm6 ; AVX512-NEXT: vpor %ymm0, %ymm6, %ymm0 @@ -3218,20 +3212,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm7 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512-NEXT: vpternlogq $248, %zmm19, %zmm0, %zmm6 -; AVX512-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512-NEXT: vprold $16, %xmm24, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] -; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] +; AVX512-NEXT: vprold $16, %xmm2, %xmm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] +; AVX512-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX512-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512-NEXT: vprold $16, %xmm20, %xmm4 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] +; AVX512-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] @@ -3260,17 +3253,15 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5 ; AVX512-FCP-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11 @@ -3288,8 +3279,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 ; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm13 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] @@ -3304,7 +3295,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512-FCP-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm5 @@ -3315,12 +3305,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm12 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20 ; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm12 -; AVX512-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] +; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm5 @@ -3352,6 +3340,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpternlogq $248, %zmm15, %zmm0, %zmm5 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0 @@ -3370,20 +3361,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7 ; AVX512-FCP-NEXT: vinserti64x4 
$1, %ymm6, %zmm7, %zmm6 ; AVX512-FCP-NEXT: vpternlogq $248, %zmm19, %zmm0, %zmm6 -; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512-FCP-NEXT: vprold $16, %xmm24, %xmm0 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] -; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] +; AVX512-FCP-NEXT: vprold $16, %xmm2, %xmm0 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] +; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm1 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512-FCP-NEXT: vprold $16, %xmm20, %xmm4 -; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] +; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] @@ -3412,17 +3402,15 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5 ; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm11 @@ -3440,8 +3428,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm10 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm12 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm12 ; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm13 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] @@ -3456,7 +3444,6 @@ define void @store_i16_stride3_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512DQ-NEXT: vmovdqa64 32(%rsi), %xmm24 ; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm5 @@ -3467,12 +3454,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm12 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512DQ-NEXT: vmovdqa64 16(%rsi), %xmm20 ; AVX512DQ-NEXT: vprold $16, %xmm12, %xmm12 -; AVX512DQ-NEXT: vmovdqa64 16(%rdi), %xmm21 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] +; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm5 @@ -3504,6 +3489,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpternlogq $248, %zmm15, %zmm0, %zmm5 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm6 ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm6, %ymm6 ; AVX512DQ-NEXT: vpor %ymm0, %ymm6, %ymm0 @@ -3522,20 +3510,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm8, %ymm7 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512DQ-NEXT: vpternlogq $248, %zmm19, %zmm0, %zmm6 -; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-NEXT: vprold $16, %xmm24, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] +; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm0 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] +; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm1 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-NEXT: vprold $16, %xmm20, %xmm4 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] +; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] @@ -3564,17 +3551,15 
@@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm5, %xmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7] -; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3] +; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11 @@ -3592,8 +3577,8 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10 -; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm13 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15] @@ -3608,7 +3593,6 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512DQ-FCP-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm5 @@ -3619,12 +3603,10 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm12 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7 -; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7] +; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm5 @@ -3656,6 +3638,9 
@@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm15, %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm8 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm6 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0 @@ -3674,20 +3659,19 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %zmm19, %zmm0, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm24, %xmm0 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] -; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm2, %xmm0 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2] +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm4 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512DQ-FCP-NEXT: vprold $16, %xmm20, %xmm4 -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2] +; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm4 +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll index a52d6cc9bd3b7..51d0ba916f2af 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -3083,20 +3083,16 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18 -; AVX512-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512-NEXT: vmovdqa 16(%rcx), %xmm5 ; AVX512-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512-NEXT: vmovdqa 48(%rcx), 
%xmm9 -; AVX512-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX512-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] @@ -3161,13 +3157,11 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 -; AVX512-NEXT: vmovdqa64 (%rsi), %xmm19 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 -; AVX512-NEXT: vmovdqa64 (%rdi), %xmm20 ; AVX512-NEXT: vmovdqa 80(%rcx), %xmm0 ; AVX512-NEXT: vmovdqa 80(%rdx), %xmm1 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -3192,16 +3186,20 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512-NEXT: vmovdqa 48(%rcx), %xmm9 +; AVX512-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512-NEXT: vpermt2d %zmm2, %zmm5, %zmm6 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512-NEXT: vmovdqa64 %xmm20, %xmm9 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] @@ -3237,20 +3235,16 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18 -; 
AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512-FCP-NEXT: vmovdqa 16(%rcx), %xmm5 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512-FCP-NEXT: vmovdqa 48(%rcx), %xmm9 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512-FCP-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512-FCP-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm5, %zmm8 @@ -3315,13 +3309,11 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 -; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %xmm19 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %xmm20 ; AVX512-FCP-NEXT: vmovdqa 80(%rcx), %xmm0 ; AVX512-FCP-NEXT: vmovdqa 80(%rdx), %xmm1 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -3346,16 +3338,20 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 48(%rcx), %xmm9 +; AVX512-FCP-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm2 ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm1 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm9 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; AVX512-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] @@ -3391,20 +3387,16 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18 -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512DQ-NEXT: vmovdqa 16(%rcx), %xmm5 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512DQ-NEXT: vmovdqa 48(%rcx), %xmm9 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512DQ-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,1,2,2,3,3,16,16,17,17,18,18,19,19] @@ -3469,13 +3461,11 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 -; AVX512DQ-NEXT: vmovdqa64 (%rsi), %xmm19 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %xmm20 ; AVX512DQ-NEXT: vmovdqa 80(%rcx), %xmm0 ; AVX512DQ-NEXT: vmovdqa 80(%rdx), %xmm1 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -3500,16 +3490,20 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-NEXT: vmovdqa 48(%rcx), %xmm9 +; AVX512DQ-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm5, %zmm6 ; AVX512DQ-NEXT: vmovdqa32 %zmm6, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm9 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] @@ -3545,20 +3539,16 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero ; 
AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm18 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rcx), %xmm5 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512DQ-FCP-NEXT: vmovdqa 48(%rcx), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdx), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm5, %zmm8 @@ -3623,13 +3613,11 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm17 = xmm2[0],zero,xmm2[1],zero ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm17, %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %xmm19 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %xmm20 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rcx), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdx), %xmm1 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] @@ -3654,16 +3642,20 @@ define void @store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 48(%rcx), %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdx), %xmm10 ; AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm2 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm9 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] ; 
AVX512DQ-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 994c785126d25..3b160cdef312b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -1158,13 +1158,9 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-LABEL: store_i16_stride5_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa 16(%rdi), %xmm5 -; SSE-NEXT: movdqa (%rsi), %xmm8 -; SSE-NEXT: movdqa (%rcx), %xmm14 ; SSE-NEXT: movdqa 16(%rsi), %xmm0 ; SSE-NEXT: movdqa 16(%rdx), %xmm10 -; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 16(%rcx), %xmm11 -; SSE-NEXT: movdqa 16(%r8), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pandn %xmm5, %xmm2 @@ -1178,7 +1174,6 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,65535,65535,0] ; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: movdqa (%rdi), %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm11[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4] ; SSE-NEXT: movdqa %xmm9, %xmm13 @@ -1186,6 +1181,11 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: por %xmm4, %xmm13 ; SSE-NEXT: pand %xmm12, %xmm13 ; SSE-NEXT: por %xmm6, %xmm13 +; SSE-NEXT: movdqa (%rdi), %xmm15 +; SSE-NEXT: movdqa (%rsi), %xmm8 +; SSE-NEXT: movdqa (%rcx), %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm2, %xmm13 ; SSE-NEXT: movdqa %xmm2, %xmm1 @@ -2489,10 +2489,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa %xmm6, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm3[0],xmm11[1],xmm3[1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] @@ -2837,10 +2837,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2: # %bb.0: ; AVX2-NEXT: subq $72, %rsp ; AVX2-NEXT: vmovdqa (%rdx), %xmm6 -; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 -; AVX2-NEXT: vmovdqa (%r8), %ymm1 -; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-NEXT: vmovdqa (%rcx), %xmm7 ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm9 @@ -2851,19 +2847,15 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqa (%rsi), %xmm12 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm10 ; AVX2-NEXT: vmovdqa (%rdi), %xmm11 -; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = 
[65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] -; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm13 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,0,0,0,65535,65535,0,0,0,65535] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-NEXT: vpbroadcastq (%r8), %ymm11 +; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535] ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm11, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] @@ -2873,13 +2865,24 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,6] ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] +; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-NEXT: vpshufb %xmm14, %xmm9, %xmm9 +; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] +; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4 +; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX2-NEXT: vmovdqa (%r8), %ymm1 +; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpbroadcastq 32(%r8), %ymm5 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX2-NEXT: vpshufb %xmm11, %xmm12, %xmm0 ; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm12 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] @@ -2898,15 +2901,10 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm11 ; AVX2-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm11[1],xmm10[2,3],xmm11[4],xmm10[5],xmm11[6],xmm10[7] ; AVX2-NEXT: vmovdqa 32(%rdx), %ymm10 -; AVX2-NEXT: vpshufb %xmm14, %xmm9, %xmm9 ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm11 -; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,2] -; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm8[1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] ; AVX2-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm15[0,1,0,1] -; AVX2-NEXT: vmovdqa (%rsi), %ymm5 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,0] ; AVX2-NEXT: vpblendvb %ymm13, %ymm14, %ymm9, %ymm9 ; 
AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm10[3,2,3,3,7,6,7,7] @@ -2918,10 +2916,11 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1] ; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm0 +; AVX2-NEXT: vmovdqa (%rsi), %ymm9 +; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,3,2,3,6,7,6,7] -; AVX2-NEXT: vmovdqa %ymm5, %ymm9 -; AVX2-NEXT: vpshufhw {{.*#+}} ymm13 = ymm5[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm13 = ymm9[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,2,6,7,6,6] ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] ; AVX2-NEXT: vmovdqa (%rcx), %ymm13 @@ -3622,7 +3621,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm5 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15] -; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] @@ -3641,8 +3639,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -3936,7 +3934,6 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm5 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm9[3,2,3,3,7,6,7,7] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2],ymm12[3,4],ymm5[5,6,7,8],ymm12[9],ymm5[10],ymm12[11,12],ymm5[13,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %xmm22 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm1 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[1,1,1,2,5,5,5,6] ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5],ymm8[6],ymm1[7,8],ymm8[9],ymm1[10,11],ymm8[12],ymm1[13],ymm8[14],ymm1[15] @@ -3955,8 +3952,8 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2],ymm4[3,4],ymm2[5,6,7,8],ymm4[9],ymm2[10],ymm4[11,12],ymm2[13,14,15] ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,2,3,2,8,9,8,9] ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, 
%xmm3 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 @@ -4583,13 +4580,13 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm12, %xmm11 ; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm12 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] ; SSE-NEXT: pand %xmm9, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm14, %xmm12 ; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm14 @@ -4978,10 +4975,10 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll index b2a270adbb359..c42bb48d39f15 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -1712,23 +1712,17 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $24, %rsp ; AVX2-FP-NEXT: vmovaps (%r9), %ymm3 -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm10 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm8 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm7 +; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm9 ; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm11 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm5 -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,1,1] ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FP-NEXT: vmovdqa %xmm11, %xmm4 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,2,3,3] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] @@ -1740,9 +1734,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0] ; AVX2-FP-NEXT: 
vpblendvb %ymm14, %ymm12, %ymm13, %ymm9 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FP-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm9 +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm10 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm8 +; AVX2-FP-NEXT: vmovdqa %xmm11, %xmm4 ; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm10[4],ymm0[5],ymm10[5],ymm0[6],ymm10[6],ymm0[7],ymm10[7],ymm0[12],ymm10[12],ymm0[13],ymm10[13],ymm0[14],ymm10[14],ymm0[15],ymm10[15] +; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[3,3,3,3] ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10 @@ -2038,15 +2037,12 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] -; AVX512-FCP-NEXT: vmovaps {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] -; AVX512-FCP-NEXT: vmovdqa64 (%r8), %ymm16 ; AVX512-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm9 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm11 @@ -2056,10 +2052,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm10 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] ; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] @@ -2068,7 +2068,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm13 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = 
xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0] ; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 @@ -2081,7 +2080,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] -; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] ; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm6 @@ -2095,7 +2093,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7] ; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] -; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] +; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm10[1,2,2,3,5,6,6,7] ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] ; AVX512-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 @@ -2205,15 +2203,12 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10] -; AVX512DQ-FCP-NEXT: vmovaps {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %ymm16 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm9 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm11 @@ -2223,10 +2218,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11] ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7] ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15 ; AVX512DQ-FCP-NEXT: vinserti64x4 
$1, %ymm15, %zmm0, %zmm12 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15] @@ -2235,7 +2234,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm13 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7 @@ -2248,7 +2246,6 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm10 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11] ; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm6 @@ -2262,7 +2259,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7] -; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7] +; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm10[1,2,2,3,5,6,6,7] ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11] ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7] ; AVX512DQ-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8 @@ -2419,21 +2416,20 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 16(%rcx), %xmm10 ; SSE-NEXT: movdqa (%r8), %xmm8 ; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: movdqa (%r9), %xmm11 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3] ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,3],xmm0[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,2],xmm7[0,1] +; SSE-NEXT: movdqa (%r9), %xmm7 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm6, %xmm9 -; SSE-NEXT: movdqa %xmm11, %xmm7 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm7[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,2,1] ; SSE-NEXT: movaps %xmm6, %xmm0 ; SSE-NEXT: andnps 
%xmm11, %xmm0 @@ -5189,21 +5185,20 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 16(%rcx), %xmm3 ; SSE-NEXT: movdqa (%r8), %xmm9 ; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] ; SSE-NEXT: movdqa %xmm10, %xmm7 -; SSE-NEXT: movdqa (%r9), %xmm8 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm0[3,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,1,3,3,4,5,6,7] ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,2],xmm5[0,1] +; SSE-NEXT: movdqa (%r9), %xmm5 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0,1,3] ; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,65535,0] ; SSE-NEXT: andps %xmm14, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm5[0,2,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,1] ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: andnps %xmm8, %xmm0 @@ -6623,13 +6618,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-NEXT: vmovdqa (%r8), %xmm1 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vmovdqa 32(%r8), %xmm4 -; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm8 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7] ; AVX2-NEXT: vmovdqa (%r9), %xmm0 @@ -6641,6 +6633,9 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-NEXT: vmovdqa 32(%r8), %xmm4 +; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] @@ -9245,27 +9240,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm2 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm3 ; AVX512DQ-NEXT: vmovdqa 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm4 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 -; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm7 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,2,1] -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm5 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm8 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm29 ; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm30 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,0,1] @@ -9276,18 +9266,23 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7] ; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero -; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm31 ; AVX512DQ-NEXT: vpbroadcastq %xmm3, %ymm3 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm29 +; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm31 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm7 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,2,1] ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,4,7,6,5] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index 1d1c4de793b6d..0dc07bf4e39da 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -3565,34 +3565,34 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa 48(%rsi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdx), %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rcx), %xmm5 ; SSE-NEXT: movdqa 48(%r8), %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: movaps 48(%rax), %xmm7 -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[3,3,3,3,4,5,6,7] ; SSE-NEXT: movdqa %xmm5, %xmm11 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 48(%r9), %xmm4 ; SSE-NEXT: movdqa %xmm3, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa 48(%r9), %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 48(%rax), %xmm7 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm2 @@ -4799,27 +4799,27 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm9 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4] -; AVX2-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-NEXT: vpermd %ymm8, %ymm0, %ymm1 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,0,0,0,4,0,0] ; AVX2-NEXT: vpermd %ymm5, %ymm2, %ymm4 +; AVX2-NEXT: vpermd %ymm12, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa 32(%r8), %ymm6 +; AVX2-NEXT: vmovdqa 32(%r9), %ymm7 ; AVX2-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-NEXT: vpermd %ymm13, %ymm0, %ymm0 ; AVX2-NEXT: 
vpshufd {{.*#+}} ymm5 = ymm15[0,3,2,3,4,7,6,7] ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpermd %ymm12, %ymm2, %ymm2 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 @@ -7721,46 +7721,47 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 112(%rdx), %xmm1 ; SSE-NEXT: movdqa 112(%rcx), %xmm6 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa 112(%r8), %xmm4 -; SSE-NEXT: movaps 112(%rax), %xmm7 -; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2] ; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa 96(%rcx), %xmm12 +; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; SSE-NEXT: movdqa 112(%r9), %xmm8 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE-NEXT: movdqa 96(%rdx), %xmm5 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa 112(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm4, %xmm9 ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: psrld $16, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm15[0,2] +; SSE-NEXT: movdqa 96(%rdx), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 112(%rax), %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2] ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: andps %xmm1, %xmm4 ; SSE-NEXT: andnps %xmm7, %xmm1 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
; SSE-NEXT: orps %xmm4, %xmm1 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rcx), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: psrlq $48, %xmm1 ; SSE-NEXT: movdqa %xmm5, %xmm3 @@ -8134,15 +8135,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: movdqa %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm6 @@ -8846,6 +8847,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm10, %xmm13 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[1,1] ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: andnps %xmm1, %xmm2 @@ -8882,7 +8884,6 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE-NEXT: movaps %xmm10, %xmm13 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: psrld $16, %xmm1 @@ -9022,7 +9023,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm13, %xmm7 +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = xmm7[1],mem[0] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -15366,8 +15367,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 +; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = 
[13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] @@ -15592,8 +15593,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] @@ -15818,8 +15819,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] @@ -16044,8 +16045,8 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3} ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 64f5761b31d64..aa541711601b1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -2261,11 +2261,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512-NEXT: vmovdqa (%r8), %ymm9 -; AVX512-NEXT: vmovdqa (%r9), %ymm11 -; AVX512-NEXT: vmovdqa (%r10), %ymm12 -; AVX512-NEXT: vmovdqa (%rax), %ymm13 ; AVX512-NEXT: vmovdqa (%rax), %xmm0 ; AVX512-NEXT: vmovdqa (%r10), %xmm2 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -2274,10 +2269,15 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%r9), %xmm0 ; AVX512-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24 ; AVX512-NEXT: 
vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] ; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vmovdqa (%r8), %ymm9 +; AVX512-NEXT: vmovdqa (%r9), %ymm11 +; AVX512-NEXT: vmovdqa (%r10), %ymm12 +; AVX512-NEXT: vmovdqa (%rax), %ymm13 +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24 ; AVX512-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] @@ -2298,7 +2298,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] ; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm14 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} @@ -2332,11 +2331,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP: # %bb.0: ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm9 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11 -; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm12 -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm13 ; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm2 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -2345,10 +2339,15 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm0 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] ; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm9 +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11 +; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm12 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm13 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] @@ -2369,7 +2368,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm14 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} @@ 
-2403,11 +2401,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm9 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm11 -; AVX512DQ-NEXT: vmovdqa (%r10), %ymm12 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm13 ; AVX512DQ-NEXT: vmovdqa (%rax), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%r10), %xmm2 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -2416,10 +2409,15 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm11 +; AVX512DQ-NEXT: vmovdqa (%r10), %ymm12 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm13 +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] @@ -2440,7 +2438,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm19, %zmm14 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} @@ -2474,11 +2471,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP: # %bb.0: ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm20 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm9 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm11 -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm13 ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -2487,10 +2479,15 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm0 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 -; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm9 +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), 
%ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm24 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] @@ -2511,7 +2508,6 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm2 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm14 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm14 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll index 22508e2ccfc79..5017bd19a3e9f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -1322,24 +1322,24 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-LABEL: store_i32_stride2_vf64: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-NEXT: vmovaps 96(%rdi), %ymm9 -; AVX2-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX2-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3] +; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-NEXT: vmovaps 128(%rdi), %ymm8 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-NEXT: vmovaps 192(%rsi), %ymm10 ; AVX2-NEXT: vmovaps 160(%rsi), %ymm11 ; AVX2-NEXT: vmovaps 128(%rsi), %ymm12 -; AVX2-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-NEXT: vmovaps 64(%rsi), %ymm13 ; AVX2-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3] -; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1] ; AVX2-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] @@ -1366,8 +1366,7 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1] ; AVX2-NEXT: vmovaps 224(%rsi), %ymm15 -; AVX2-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-NEXT: 
vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] ; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3] ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1] @@ -1394,24 +1393,24 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-FP-LABEL: store_i32_stride2_vf64: ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm9 -; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX2-FP-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3] +; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm8 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-FP-NEXT: vmovaps 192(%rsi), %ymm10 ; AVX2-FP-NEXT: vmovaps 160(%rsi), %ymm11 ; AVX2-FP-NEXT: vmovaps 128(%rsi), %ymm12 -; AVX2-FP-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-FP-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-FP-NEXT: vmovaps 64(%rsi), %ymm13 ; AVX2-FP-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3] -; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1] ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] @@ -1438,8 +1437,7 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1] ; AVX2-FP-NEXT: vmovaps 224(%rsi), %ymm15 -; AVX2-FP-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3] ; AVX2-FP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1] @@ -1466,24 +1464,24 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-FCP-LABEL: store_i32_stride2_vf64: ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1 -; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm9 -; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm8 +; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm2 +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = 
ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3] +; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm8 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm7 +; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm9 ; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm10 ; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm11 ; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm12 -; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm5 ; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm13 ; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm14 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] -; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm15[2,3] -; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm15[0,1] ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] @@ -1510,8 +1508,7 @@ define void @store_i32_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.v ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm15[2,3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[0,1],ymm15[0,1] ; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm15 -; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm15[2],ymm0[3],ymm15[3],ymm0[6],ymm15[6],ymm0[7],ymm15[7] +; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm0[2,3] ; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[0,1],ymm0[0,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index 7d636b2d8aa3b..62706b8226ccf 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -715,18 +715,18 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps 32(%rsi), %xmm10 ; SSE-NEXT: movaps 48(%rsi), %xmm11 ; SSE-NEXT: movaps 48(%rdx), %xmm8 -; SSE-NEXT: movaps 32(%rdx), %xmm3 ; SSE-NEXT: movaps %xmm5, %xmm12 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm11[1] ; SSE-NEXT: movaps %xmm5, %xmm13 ; SSE-NEXT: movaps %xmm5, %xmm6 -; SSE-NEXT: movaps 16(%rdx), %xmm0 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,3],xmm11[3,3] ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,1],xmm8[1,1] -; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm8[0,3] +; SSE-NEXT: movaps 16(%rdx), %xmm8 +; SSE-NEXT: movaps 32(%rdx), 
%xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm12[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm13[2,0] ; SSE-NEXT: movaps %xmm4, %xmm13 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm10[1] @@ -745,10 +745,9 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, %xmm13 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm9[3,3] -; SSE-NEXT: movaps %xmm0, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm0[1,1] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,1],xmm8[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm14[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm0[0,3] +; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[1,1],xmm8[0,3] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,0] ; SSE-NEXT: movaps %xmm1, %xmm14 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm7[1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index ede8586545e49..42a15b1d90e53 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -1335,8 +1335,6 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-LABEL: store_i32_stride5_vf16: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rsi), %xmm6 -; AVX-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX-NEXT: vmovaps (%rdi), %xmm10 ; AVX-NEXT: vmovaps 32(%rdi), %xmm9 @@ -1344,12 +1342,11 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm10[0],xmm6[0],zero,zero ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX-NEXT: vmovaps (%rdx), %xmm11 -; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0] ; AVX-NEXT: vmovaps (%rcx), %xmm12 +; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm12[0],xmm11[0] ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,0] ; AVX-NEXT: vbroadcastss 4(%rdx), %xmm13 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7 -; AVX-NEXT: vmovaps (%rdi), %ymm0 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6],ymm7[7] ; AVX-NEXT: vinsertf128 $1, (%r8), %ymm5, %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6,7] @@ -1364,10 +1361,11 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vinsertps {{.*#+}} xmm15 = xmm9[0],xmm8[0],zero,zero ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm15, %ymm5 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5,6],ymm7[7] -; AVX-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX-NEXT: vinsertf128 $1, 32(%r8), %ymm15, %ymm15 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm5[1,2,3],ymm15[4],ymm5[5,6,7] -; AVX-NEXT: vmovaps 32(%rdx), %ymm2 +; AVX-NEXT: vmovaps (%rdi), %ymm5 +; AVX-NEXT: vmovaps 32(%rdi), %ymm1 +; AVX-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vunpckhps {{.*#+}} xmm11 = xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX-NEXT: vbroadcastss 4(%rcx), %xmm12 @@ -1377,6 +1375,8 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm10 = mem[0,1,0,1] ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7] +; AVX-NEXT: vmovaps 32(%rdx), 
%ymm2 +; AVX-NEXT: vmovaps 32(%rcx), %ymm7 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 48(%rsi), %xmm10 ; AVX-NEXT: vmovaps 48(%rdi), %xmm11 @@ -1415,8 +1415,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm9[2],xmm13[3],xmm9[3] ; AVX-NEXT: vmovaps 16(%rsi), %xmm13 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 -; AVX-NEXT: vmovaps %ymm0, %ymm5 -; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,3],ymm0[3,3],ymm15[7,7],ymm0[7,7] +; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[3,3],ymm5[3,3],ymm15[7,7],ymm5[7,7] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX-NEXT: vmovaps 16(%rdi), %xmm14 @@ -4775,10 +4774,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps (%rcx), %xmm5 ; SSE-NEXT: movaps 16(%rcx), %xmm9 ; SSE-NEXT: movaps (%r8), %xmm4 -; SSE-NEXT: movaps 32(%r8), %xmm14 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps 16(%r8), %xmm11 -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm4[3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3] @@ -4788,15 +4784,18 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm9, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm11[3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,3,3,3] -; SSE-NEXT: movaps 32(%rcx), %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,2] -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE-NEXT: movaps 32(%rcx), %xmm12 +; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm12, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm14[3,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] @@ -5311,14 +5310,14 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: movaps 240(%rdi), %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; SSE-NEXT: movaps %xmm5, %xmm6 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: movaps %xmm5, %xmm0 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm1[0] ; SSE-NEXT: 
movaps %xmm3, %xmm15 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index 31d7791d674a4..64aa6ddb85317 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -10091,46 +10091,34 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 -; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 @@ -10147,21 +10135,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr 
%in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 +; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -10659,46 +10659,34 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 ; AVX512DQ-FCP-NEXT: 
vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 @@ -10715,21 +10703,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11227,46 +11227,34 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 
@@ -11283,21 +11271,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -11795,46 +11795,34 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,16,0,0,2,18,1,17,0,16,0,0,2,18,1,17] ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] -; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = 
[4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23] ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm31 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [8,24,0,0,10,26,9,25,8,24,0,0,10,26,9,25] ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm11, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm30 = [14,30,15,31,14,30,15,31,14,30,15,31,14,30,15,31] ; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm30, %zmm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm7, %zmm0 @@ -11851,21 +11839,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm12, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rsi), %zmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rsi), %zmm23 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm30, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [1,17,0,16,0,0,2,18,1,17,0,16,0,0,2,18] +; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm20, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm7, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) 
# 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm26 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm11, %zmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm12, %zmm28 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm30, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll index 275f36005f1ee..af60bf28554b0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -1780,43 +1780,42 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $520, %rsp # imm = 0x208 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rdx), %xmm5 -; SSE-NEXT: movaps (%rcx), %xmm8 -; SSE-NEXT: movaps 16(%r8), %xmm14 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movaps (%rcx), %xmm8 ; SSE-NEXT: movaps (%r8), %xmm15 -; SSE-NEXT: movdqa (%r9), %xmm13 -; SSE-NEXT: movaps 16(%rcx), %xmm2 -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm11 -; SSE-NEXT: movdqa 16(%rsi), %xmm6 ; SSE-NEXT: movaps %xmm15, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm8[1,1] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm1[0],xmm5[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0] +; SSE-NEXT: movdqa (%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rdx), %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm13 +; SSE-NEXT: movdqa (%rax), %xmm11 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pshufd 
{{.*#+}} xmm1 = xmm6[1,1,1,1] +; SSE-NEXT: movdqa 16(%rsi), %xmm9 +; SSE-NEXT: movaps 16(%rcx), %xmm2 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 16(%r8), %xmm14 +; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm14, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] @@ -1828,8 +1827,8 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 @@ -1918,12 +1917,12 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] ; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1] ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm10[0,3] ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: movaps %xmm12, %xmm11 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -4016,39 +4015,40 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa (%rdi), %xmm8 ; SSE-NEXT: movdqa (%rsi), %xmm10 ; SSE-NEXT: movaps (%rdx), %xmm14 -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm13 -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%rcx), %xmm9 ; SSE-NEXT: movaps (%r8), %xmm0 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r9), %xmm15 -; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm12 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rsi), %xmm4 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rax), %xmm2 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] -; SSE-NEXT: 
movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1] ; SSE-NEXT: movaps %xmm14, %xmm3 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0] ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm10[2],xmm0[3],xmm10[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 16(%r8), %xmm11 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm8[0] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 16(%r8), %xmm11 +; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm11, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] @@ -4060,15 +4060,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE-NEXT: movdqa 16(%rdi), %xmm15 ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm5 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdx), %xmm5 +; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm5, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 32(%rcx), %xmm5 @@ -4119,10 +4119,10 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm12 ; SSE-NEXT: movdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm3 @@ -6797,8 +6797,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512-NEXT: movw $14448, %cx # imm = 0x3870 +; AVX512-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512-NEXT: kmovw %ecx, %k2 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm5 @@ -7000,8 +7000,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 -; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512-FCP-NEXT: movw $14448, %cx # imm = 0x3870 +; AVX512-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512-FCP-NEXT: kmovw %ecx, %k2 ; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 @@ -7203,8 +7203,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 -; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512DQ-NEXT: movw $14448, %cx # imm = 0x3870 +; AVX512DQ-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512DQ-NEXT: kmovw %ecx, %k2 ; AVX512DQ-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm5 @@ -7406,8 +7406,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512DQ-FCP-NEXT: movw $14448, %cx # imm = 0x3870 +; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512DQ-FCP-NEXT: kmovw %ecx, %k2 ; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 @@ -7609,8 +7609,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 +; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm5 @@ -7812,8 +7812,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512BW-FCP-NEXT: 
vmovdqa32 %zmm5, %zmm7 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 @@ -8015,8 +8015,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: movw $14448, %cx # imm = 0x3870 +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm5 @@ -8218,8 +8218,8 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm25, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,0,0,4,5,16,0,0,0,0,11,12,17,0,0] ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw $14448, %cx # imm = 0x3870 +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2} ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm7 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm5 @@ -8381,43 +8381,43 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: subq $2760, %rsp # imm = 0xAC8 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa (%rsi), %xmm4 -; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rdx), %xmm2 -; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%rdx), %xmm7 -; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps (%rcx), %xmm13 -; SSE-NEXT: movaps 16(%rcx), %xmm9 -; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rdi), %xmm6 ; SSE-NEXT: movaps (%r8), %xmm0 -; SSE-NEXT: movaps 16(%r8), %xmm10 -; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%r9), %xmm12 -; SSE-NEXT: movdqa 16(%r9), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rax), %xmm15 +; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm13[1,1] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0] +; SSE-NEXT: movdqa (%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%rsi), %xmm3 +; SSE-NEXT: movdqa 16(%rdx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 16(%rcx), %xmm9 +; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%r9), %xmm12 +; SSE-NEXT: movdqa (%rax), %xmm15 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} 
xmm2 = xmm12[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: movaps 16(%r8), %xmm10 +; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r9), %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm10, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] @@ -8434,10 +8434,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 32(%rsi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 32(%rdx), %xmm3 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 32(%rdx), %xmm3 +; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm3, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 32(%rcx), %xmm3 @@ -8487,10 +8487,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 64(%rsi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 64(%rdx), %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 64(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 64(%rcx), %xmm4 @@ -8540,10 +8540,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 96(%rsi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 96(%rdx), %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 96(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 96(%rcx), %xmm10 @@ -8592,10 +8592,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rsi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 128(%rdx), 
%xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 128(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 128(%rcx), %xmm4 @@ -8645,10 +8645,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 160(%rsi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 160(%rdx), %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 160(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 160(%rcx), %xmm4 @@ -8698,10 +8698,10 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 192(%rsi), %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movaps 192(%rdx), %xmm4 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movaps 192(%rdx), %xmm4 +; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movaps %xmm4, %xmm1 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE-NEXT: movaps 192(%rcx), %xmm4 @@ -9646,20 +9646,19 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX-NEXT: subq $3432, %rsp # imm = 0xD68 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX-NEXT: vmovaps 224(%rdx), %ymm0 -; AVX-NEXT: vmovaps 224(%rsi), %ymm2 -; AVX-NEXT: vmovaps 224(%r8), %ymm4 -; AVX-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rcx), %ymm5 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] +; AVX-NEXT: vmovaps 224(%rdi), %ymm5 +; AVX-NEXT: vmovaps 224(%rsi), %ymm2 +; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vmovaps 224(%r8), %ymm4 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vmovaps 224(%rax), %ymm3 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm5[2],ymm0[3],ymm5[3],ymm0[6],ymm5[6],ymm0[7],ymm5[7] -; AVX-NEXT: vmovaps %ymm1, %ymm5 -; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] +; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm2[1],ymm5[1],ymm2[3],ymm5[3] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4] ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index bac4ff8ce434d..d01c8b91de510 100644 --- 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
@@ -871,24 +871,24 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm3, %xmm2
 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm8, %xmm4
-; SSE-NEXT: movaps 16(%rax), %xmm7
 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,0]
-; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm14, %xmm2
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1]
-; SSE-NEXT: movaps %xmm11, %xmm4
 ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm1[2],xmm15[3],xmm1[3]
 ; SSE-NEXT: movaps %xmm15, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2]
 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm9, %xmm1
 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
+; SSE-NEXT: movaps 16(%rax), %xmm7
+; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movaps %xmm11, %xmm4
 ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3]
 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movaps %xmm8, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
index 75b76f891d46c..5bde9571124a6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
@@ -844,26 +844,26 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; SSE-NEXT: movapd 64(%rdi), %xmm5
 ; SSE-NEXT: movapd (%rdi), %xmm1
 ; SSE-NEXT: movapd (%rsi), %xmm4
-; SSE-NEXT: movapd 16(%rsi), %xmm7
-; SSE-NEXT: movapd 64(%rsi), %xmm9
-; SSE-NEXT: movapd 48(%rdi), %xmm6
+; SSE-NEXT: movapd (%rdx), %xmm0
+; SSE-NEXT: movapd %xmm1, %xmm8
+; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
+; SSE-NEXT: movapd 16(%rdi), %xmm0
 ; SSE-NEXT: movapd 32(%rdi), %xmm3
-; SSE-NEXT: movapd 16(%rdi), %xmm2
+; SSE-NEXT: movapd 48(%rdi), %xmm6
+; SSE-NEXT: movapd 64(%rsi), %xmm9
+; SSE-NEXT: movapd 16(%rsi), %xmm7
 ; SSE-NEXT: movapd 32(%rsi), %xmm11
 ; SSE-NEXT: movapd 48(%rsi), %xmm10
 ; SSE-NEXT: movapd 64(%rdx), %xmm15
-; SSE-NEXT: movapd (%rdx), %xmm0
 ; SSE-NEXT: movapd 16(%rdx), %xmm12
 ; SSE-NEXT: movapd 32(%rdx), %xmm13
 ; SSE-NEXT: movapd 48(%rdx), %xmm14
-; SSE-NEXT: movapd %xmm1, %xmm8
-; SSE-NEXT: unpcklpd {{.*#+}} xmm8 = xmm8[0],xmm4[0]
 ; SSE-NEXT: movapd %xmm8, (%rsp) # 16-byte Spill
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
 ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movapd %xmm0, %xmm2
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm7[0]
 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm12[0],xmm0[1]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index d610029880f81..f27c115ef4c5f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -6746,15 +6746,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm14
 ; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm23
-; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm11
-; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm3
-; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm2
-; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm12
-; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
@@ -6766,11 +6757,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm4
+; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512-NEXT: vmovdqa64 384(%rcx), %zmm23
+; AVX512-NEXT: vmovdqa64 320(%rcx), %zmm3
+; AVX512-NEXT: vmovdqa64 256(%rcx), %zmm0
+; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1
+; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm12
@@ -7041,15 +7041,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
-; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
@@ -7061,11 +7052,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
+; AVX512-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
@@ -7336,15 +7336,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm14
 ; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm23
-; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm11
-; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm3
-; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm2
-; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm12
-; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
@@ -7356,11 +7347,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm4
+; AVX512DQ-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512DQ-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-NEXT: vmovdqa64 384(%rcx), %zmm23
+; AVX512DQ-NEXT: vmovdqa64 320(%rcx), %zmm3
+; AVX512DQ-NEXT: vmovdqa64 256(%rcx), %zmm0
+; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1
+; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm12
@@ -7631,15 +7631,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
@@ -7651,11 +7642,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
@@ -7926,15 +7926,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm14
 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm23
-; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm11
-; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm3
-; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm12
-; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
@@ -7946,11 +7937,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm4
+; AVX512BW-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512BW-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512BW-NEXT: vmovdqa64 384(%rcx), %zmm23
+; AVX512BW-NEXT: vmovdqa64 320(%rcx), %zmm3
+; AVX512BW-NEXT: vmovdqa64 256(%rcx), %zmm0
+; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1
+; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12
@@ -8221,15 +8221,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
-; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
@@ -8241,11 +8232,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
+; AVX512BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
@@ -8516,15 +8516,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm14
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm23
-; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm11
-; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm3
-; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm2
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm12
-; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
@@ -8536,11 +8527,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm4
+; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-BW-NEXT: vmovdqa64 384(%rcx), %zmm23
+; AVX512DQ-BW-NEXT: vmovdqa64 320(%rcx), %zmm3
+; AVX512DQ-BW-NEXT: vmovdqa64 256(%rcx), %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-BW-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm12
@@ -8811,15 +8811,6 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdx), %zmm8
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm14
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,0,0,8,0,0,1,9]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
@@ -8831,11 +8822,20 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,4,12,0,0,5,13]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,6,14,0,0,7,15]
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdx), %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdx), %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rcx), %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rcx), %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rcx), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rcx), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm9, %zmm12
 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index 78a8042b3535e..4de10147d80e5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -7395,34 +7395,34 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX: # %bb.0:
 ; AVX-NEXT: subq $2264, %rsp # imm = 0x8D8
 ; AVX-NEXT: vmovaps 96(%rdi), %ymm5
-; AVX-NEXT: vmovaps 160(%rdi), %ymm4
 ; AVX-NEXT: vmovaps 64(%rcx), %ymm1
-; AVX-NEXT: vmovaps (%rcx), %ymm2
 ; AVX-NEXT: vmovaps 128(%rcx), %ymm0
+; AVX-NEXT: vmovaps (%rcx), %ymm2
 ; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3]
 ; AVX-NEXT: vmovaps 16(%rdx), %xmm6
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
-; AVX-NEXT: vmovaps 80(%rdx), %xmm3
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 192(%rdi), %ymm14
 ; AVX-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3]
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 80(%rdx), %xmm3
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT: vmovaps 96(%rcx), %xmm2
 ; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
 ; AVX-NEXT: vmovaps 144(%rdx), %xmm2
 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; AVX-NEXT: vmovaps 192(%rdi), %ymm14
+; AVX-NEXT: vmovaps 160(%rdi), %ymm4
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
index 651f851f9f6f9..4eea4707e120d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll
@@ -4443,6 +4443,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm28
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm28
+; AVX512DQ-FCP-NEXT: movb $112, %sil
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
 ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
@@ -4457,7 +4458,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm19
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm19
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT: movb $112, %sil
 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22
 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
@@ -4468,10 +4468,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm29
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm24
+; AVX512DQ-FCP-NEXT: movb $96, %sil
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm27
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm21
-; AVX512DQ-FCP-NEXT: movb $96, %sil
 ; AVX512DQ-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
 ; AVX512DQ-FCP-NEXT: kmovw %esi, %k3
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3}
@@ -4481,6 +4481,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm22
+; AVX512DQ-FCP-NEXT: movb $120, %sil
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm30
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm30
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm26
@@ -4500,7 +4501,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm12, %zmm20
-; AVX512DQ-FCP-NEXT: movb $120, %sil
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm20, %zmm30
@@ -5267,6 +5267,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm28
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm28
+; AVX512DQ-BW-FCP-NEXT: movb $112, %sil
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [3,0,12,4,3,0,12,4]
 ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
@@ -5281,7 +5282,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm19
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm19
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm22, %zmm0
-; AVX512DQ-BW-FCP-NEXT: movb $112, %sil
 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm22 = xmm27[0],mem[0]
 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm22, %zmm0, %zmm0 {%k3}
@@ -5292,10 +5292,10 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm29
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm28, %zmm29
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm28, %zmm24
+; AVX512DQ-BW-FCP-NEXT: movb $96, %sil
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm27
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm21
-; AVX512DQ-BW-FCP-NEXT: movb $96, %sil
 ; AVX512DQ-BW-FCP-NEXT: vinserti64x2 $3, 64(%rax), %zmm21, %zmm0 {%k3}
 ; AVX512DQ-BW-FCP-NEXT: kmovd %esi, %k3
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm24 {%k3}
@@ -5305,6 +5305,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm29, %zmm22
+; AVX512DQ-BW-FCP-NEXT: movb $120, %sil
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm30
 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm30
 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm26
@@ -5324,7 +5325,6 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm12, %zmm20
-; AVX512DQ-BW-FCP-NEXT: movb $120, %sil
 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm20, %zmm30
@@ -16933,57 +16933,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm9
-; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm17
 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm16
-; AVX512-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm18
-; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm15
-; AVX512-NEXT: kmovw %r10d, %k1
-; AVX512-NEXT: movb $96, %r10b
-; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
-; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: vmovdqa64 (%rax), %zmm5
-; AVX512-NEXT: vmovdqa64 (%rdx), %zmm3
-; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
-; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512-NEXT: vmovdqa64 64(%rax), %zmm6
-; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
-; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
 ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
-; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512-NEXT: vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 (%rdx), %zmm3
+; AVX512-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
+; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: movb $96, %r10b
+; AVX512-NEXT: kmovw %r10d, %k1
+; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0
 ; AVX512-NEXT: vpermt2q %zmm9, %zmm11, %zmm0
 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT: vpermt2q %zmm4, %zmm31, %zmm2
+; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512-NEXT: vmovdqa (%r9), %ymm7
-; AVX512-NEXT: vmovdqa 64(%r9), %ymm3
-; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-NEXT: vmovdqa (%r8), %ymm0
 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa 64(%r8), %ymm4
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: kmovw %r10d, %k2
 ; AVX512-NEXT: movb $28, %r10b
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19
+; AVX512-NEXT: kmovw %r10d, %k2
 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
@@ -16999,18 +16984,32 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6]
 ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9
-; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vpermt2q %zmm1, %zmm13, %zmm0
+; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm1
+; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm1
 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7]
 ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-NEXT: vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19
+; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm17
+; AVX512-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
+; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 64(%rcx), %zmm18
+; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 64(%rax), %zmm6
+; AVX512-NEXT: vmovdqa 64(%r9), %ymm3
+; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 64(%r8), %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm1
 ; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm1
@@ -17895,58 +17894,44 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 (%rsi), %zmm8
-; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm4
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm10
 ; AVX512-FCP-NEXT: vmovdqa64 (%rcx), %zmm11
-; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19
-; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
 ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm18
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
 ; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: movb $96, %r10b
-; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm10
 ; AVX512-FCP-NEXT: kmovw %r10d, %k1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
-; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm0
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0
 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm10
-; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm8
 ; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm7
-; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
 ; AVX512-FCP-NEXT: movb $28, %r10b
 ; AVX512-FCP-NEXT: kmovw %r10d, %k2
-; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
@@ -17964,18 +17949,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7]
 ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm1
 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
 ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20
+; AVX512-FCP-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19
+; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm10
+; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm7
+; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm4
 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm4
 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
+; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm1
@@ -18854,55 +18854,39 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 (%rsi), %zmm9
-; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm13
-; AVX512DQ-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512DQ-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm15
-; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6
-; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm14
-; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT: movb $96, %r10b
-; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: kmovw %r10d, %k1
-; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm4
-; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5
 ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm3
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm5
+; AVX512DQ-NEXT: vmovdqa64 (%rcx), %zmm6
+; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT: movb $96, %r10b
+; AVX512DQ-NEXT: kmovw %r10d, %k1
+; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0
-; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1}
-; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm2
 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm10, %zmm0
 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2
+; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm31, %zmm2
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm7
-; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0
 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm6
-; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm16
 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQ-NEXT: movb $28, %r10b
 ; AVX512DQ-NEXT: kmovw %r10d, %k2
@@ -18915,21 +18899,37 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12
 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm17, %zmm0
 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm24, %zmm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm25, %zmm1
 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7]
 ; AVX512DQ-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm16
+; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 64(%rsi), %zmm13
+; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12
+; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm18
+; AVX512DQ-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 64(%rcx), %zmm15
+; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm4
+; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm5
+; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm6
+; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1
 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm1
@@ -19808,56 +19808,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rsi), %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512DQ-FCP-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm3
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rcx), %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3]
 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: movb $96, %r10b
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3]
 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: movb $96, %r10b
 ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
-; AVX512DQ-FCP-NEXT: vmovaps (%rdx), %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm22
 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
-; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm7
 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm13
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12
 ; AVX512DQ-FCP-NEXT: movb $28, %r10b
 ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2
 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3]
@@ -19873,7 +19858,6 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm2
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7]
@@ -19885,7 +19869,23 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
 ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm1
 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2
@@ -20761,57 +20761,42 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm9
-; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17
 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16
-; AVX512BW-NEXT: vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512BW-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm18
-; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4
-; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm15
-; AVX512BW-NEXT: kmovd %r10d, %k1
-; AVX512BW-NEXT: movb $96, %r10b
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
-; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm5
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3
-; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
-; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm6
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
-; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512BW-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [4,9,0,3,4,9,0,3]
 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm24, %zmm2
 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm3
+; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm4
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [11,3,11,3,11,3,11,3]
+; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: movb $96, %r10b
+; AVX512BW-NEXT: kmovd %r10d, %k1
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm0
 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm11, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm31, %zmm2
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT: vmovdqa (%r9), %ymm7
-; AVX512BW-NEXT: vmovdqa 64(%r9), %ymm3
-; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm0
 ; AVX512BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: vmovdqa 64(%r8), %ymm4
 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
-; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-NEXT: kmovd %r10d, %k2
 ; AVX512BW-NEXT: movb $28, %r10b
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19
+; AVX512BW-NEXT: kmovd %r10d, %k2
 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
@@ -20827,18 +20812,32 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [5,0,14,6,5,0,14,6]
 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9
-; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm13, %zmm0
+; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm1
+; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,13,6,7,0,13,6,7]
 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm1
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [6,13,14,7,6,13,14,7]
 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19
+; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm16
+; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm17
+; AVX512BW-NEXT: vmovdqu64 %zmm17, (%rsp) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm18
+; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm6
+; AVX512BW-NEXT: vmovdqa 64(%r9), %ymm3
+; AVX512BW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT: vmovdqa 64(%r8), %ymm4
+; AVX512BW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15
+; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm1
 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm11, %zmm1
@@ -21723,58 +21722,44 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512BW-FCP-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovups %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovups %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm18
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm4
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
+; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
+; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm10
 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm11
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [11,3,11,3,11,3,11,3]
 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm18
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,10,0,3,2,10,0,3]
 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: movb $96, %r10b
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm10
 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7]
 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
-; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
-; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm0
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm0
 ; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 64(%r9), %ymm10
-; AVX512BW-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm8
 ; AVX512BW-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm7
-; AVX512BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
 ; AVX512BW-FCP-NEXT: movb $28, %r10b
 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm4[2,3,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,12,0,5,4,12,0,5]
 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
@@ -21792,18 +21777,33 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,13,6,7,0,13,6,7]
 ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm15, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm1
 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7]
 ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm20
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, (%rsp) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm19
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa 64(%r9), %ymm10
+; AVX512BW-FCP-NEXT: vmovdqa 64(%r8), %ymm7
+; AVX512BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm4
 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm4
 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm10[0],ymm7[2],ymm10[2]
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[2,3,2,3],zmm5[2,3,2,3]
 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm1
@@ -22682,55 +22682,39 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm9
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm18
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovups %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm13
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1
-; AVX512DQ-BW-NEXT: vmovups %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
-; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm14
-; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
-; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT: movb $96, %r10b
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: kmovd %r10d, %k1
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5
 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm3
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
-; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1]
 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm0, %zmm2
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3]
 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm6
+; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,3,11,3,11,3,11,3]
+; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm31 = [2,10,0,3,2,10,0,3]
+; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT: movb $96, %r10b
+; AVX512DQ-BW-NEXT: kmovd %r10d, %k1
+; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [15,7,15,7,15,7,15,7]
+; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm2
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm10, %zmm0
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2
+; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm31, %zmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm7
-; AVX512DQ-BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm5
 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0
 ; AVX512DQ-BW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa 64(%r8), %ymm6
-; AVX512DQ-BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm16
 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2]
 ; AVX512DQ-BW-NEXT: movb $28, %r10b
 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2
@@ -22743,21 +22727,37 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,1,12,7,0,1,12,7]
 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [5,0,14,6,5,0,14,6]
 ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm0
 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,13,6,7,0,13,6,7]
 ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm24, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm25, %zmm1
 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [6,13,14,7,6,13,14,7]
 ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm16
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm13
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm18
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm15
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm4
+; AVX512DQ-BW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa 64(%r9), %ymm5
+; AVX512DQ-BW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa 64(%r8), %ymm6
+; AVX512DQ-BW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1
 ;
AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1 @@ -23636,56 +23636,41 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1 -; AVX512DQ-BW-FCP-NEXT: vmovups %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] +; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] +; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,3,11,3,11,3,11,3] ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: movb $96, %r10b ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [2,10,0,3,2,10,0,3] ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: movb $96, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdx), %zmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [15,7,15,7,15,7,15,7] ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [9,1,9,1,9,1,9,1] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm0, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [4,9,0,3,4,9,0,3] -; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm7 -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm7 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r8), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm7[0],ymm0[2],ymm7[2] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 ; AVX512DQ-BW-FCP-NEXT: movb $28, %r10b ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k2} = zmm0[2,3,2,3],zmm3[2,3,2,3] @@ -23701,7 +23686,6 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [5,0,14,6,5,0,14,6] ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,13,6,7,0,13,6,7] @@ -23713,7 +23697,23 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [6,13,14,7,6,13,14,7] ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r9), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%r8), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm8, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll index 5c005567db232..b98e79aa88867 100644 --- 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-8.ll @@ -18633,12 +18633,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512-NEXT: vmovdqa64 64(%r8), %zmm18 ; AVX512-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512-NEXT: vmovdqa64 64(%r10), %zmm31 ; AVX512-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512-NEXT: movb $-64, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 @@ -18674,20 +18672,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -18715,8 +18711,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 ; AVX512-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512-NEXT: vmovdqa64 128(%r10), %zmm7 +; AVX512-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18725,9 +18728,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm5 ; AVX512-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 
; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] @@ -18738,9 +18738,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18749,9 +18749,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -18760,13 +18760,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4 @@ -18775,9 +18775,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 ; AVX512-NEXT: vmovdqa64 192(%rcx), %zmm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7] ; AVX512-NEXT: 
vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -19580,12 +19580,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 ; AVX512-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512-FCP-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512-FCP-NEXT: movb $-64, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k1 @@ -19621,20 +19619,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -19662,8 +19658,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512-FCP-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512-FCP-NEXT: vmovdqa64 128(%r10), %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19672,9 +19675,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: 
vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] @@ -19685,9 +19685,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19696,9 +19696,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19707,13 +19707,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 @@ -19722,9 +19722,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 ; AVX512-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 ; AVX512-FCP-NEXT: vpblendd 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7] ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -20527,12 +20527,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512DQ-NEXT: vmovdqa64 64(%r8), %zmm18 ; AVX512DQ-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512DQ-NEXT: vmovdqa64 64(%r10), %zmm31 ; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-NEXT: movb $-64, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 @@ -20568,20 +20566,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -20609,8 +20605,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512DQ-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-NEXT: vmovdqa64 128(%r10), %zmm7 +; AVX512DQ-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20619,9 +20622,6 @@ 
define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] @@ -20632,9 +20632,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20643,9 +20643,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -20654,13 +20654,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: 
vmovdqa64 192(%rdi), %zmm4 @@ -20669,9 +20669,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 ; AVX512DQ-NEXT: vmovdqa64 192(%rcx), %zmm1 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7] ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -21474,12 +21474,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 @@ -21515,20 +21513,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -21556,8 +21552,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-FCP-NEXT: vpermt2q 
%zmm5, %zmm25, %zmm4 +; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%r10), %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21566,9 +21569,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] @@ -21579,9 +21579,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21590,9 +21590,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -21601,13 +21601,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqu64 
%zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 @@ -21616,9 +21616,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7] ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -22421,12 +22421,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm18 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm31 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512BW-NEXT: movb $-64, %r11b ; AVX512BW-NEXT: kmovd %r11d, %k1 @@ -22462,20 +22460,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -22503,8 +22499,15 @@ define void 
@store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512BW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-NEXT: vmovdqa64 128(%r10), %zmm7 +; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22513,9 +22516,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] @@ -22526,9 +22526,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22537,9 +22537,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -22548,13 +22548,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm11, 
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 @@ -22563,9 +22563,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -23368,12 +23368,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512BW-FCP-NEXT: movb $-64, %r11b ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 @@ -23409,20 +23407,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512BW-FCP-NEXT: 
vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -23450,8 +23446,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm7 +; AVX512BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23460,9 +23463,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 ; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] @@ -23473,9 +23473,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23484,9 +23484,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = 
zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -23495,13 +23495,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 @@ -23510,9 +23510,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7] ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -24315,12 +24315,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm18 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r10), %zmm31 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-BW-NEXT: movb $-64, %r11b ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 @@ -24356,20 +24354,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 
%zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -24397,8 +24393,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 128(%r10), %zmm7 +; AVX512DQ-BW-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24407,9 +24410,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] @@ -24420,9 +24420,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte 
Spill @@ -24431,9 +24431,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -24442,13 +24442,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 @@ -24457,9 +24457,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rcx), %zmm1 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7] ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 @@ -25262,12 +25262,10 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm30 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm28 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm26 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r10), %zmm31 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), 
%zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm27 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %r11b ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 @@ -25303,20 +25301,18 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm11 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm23, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm26[0],zmm17[0],zmm26[2],zmm17[2],zmm26[4],zmm17[4],zmm26[6],zmm17[6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm16 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [7,15,7,15,7,15,7,15] ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm14 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm8 {%k1} = zmm26[1],zmm17[1],zmm26[3],zmm17[3],zmm26[5],zmm17[5],zmm26[7],zmm17[7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 @@ -25344,8 +25340,15 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm23, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r8), %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r9), %zmm22 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%r10), %zmm7 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rax), %zmm13 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm23, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -25354,9 +25357,6 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm31[0],zmm27[0],zmm31[2],zmm27[2],zmm31[4],zmm27[4],zmm31[6],zmm27[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm5 {%k1} = zmm31[1],zmm27[1],zmm31[3],zmm27[3],zmm31[5],zmm27[5],zmm31[7],zmm27[7] @@ -25367,9 +25367,9 @@ define void 
@store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm19, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm11[0],zmm22[0],zmm11[2],zmm22[2],zmm11[4],zmm22[4],zmm11[6],zmm22[6] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm6 = zmm9[0],zmm22[0],zmm9[2],zmm22[2],zmm9[4],zmm22[4],zmm9[6],zmm22[6] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -25378,9 +25378,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm11[1],zmm22[1],zmm11[3],zmm22[3],zmm11[5],zmm22[5],zmm11[7],zmm22[7] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm6 = zmm9[1],zmm22[1],zmm9[3],zmm22[3],zmm9[5],zmm22[5],zmm9[7],zmm22[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm6 {%k1} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -25389,13 +25389,13 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm5 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm16[0],zmm13[0],zmm16[2],zmm13[2],zmm16[4],zmm13[4],zmm16[6],zmm13[6] +; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} zmm5 {%k1} = zmm7[0],zmm13[0],zmm7[2],zmm13[2],zmm7[4],zmm13[4],zmm7[6],zmm13[6] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 @@ -25404,9 +25404,9 @@ define void @store_i64_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rcx), %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm2 -; 
AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm16[1],zmm13[1],zmm16[3],zmm13[3],zmm16[5],zmm13[5],zmm16[7],zmm13[7] +; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm7[1],zmm13[1],zmm7[3],zmm13[3],zmm7[5],zmm13[5],zmm7[7],zmm13[7] ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index c1e99368e9201..c8f5ffbc13f2c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -1659,7 +1659,6 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm15 ; AVX-NEXT: vpor %xmm2, %xmm15, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovdqa 16(%rdx), %xmm1 ; AVX-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm15 ; AVX-NEXT: vpor %xmm6, %xmm15, %xmm0 @@ -1673,11 +1672,11 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX-NEXT: vpshufb %xmm11, %xmm15, %xmm10 ; AVX-NEXT: vpor %xmm7, %xmm10, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovdqa 16(%rdx), %xmm0 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15] ; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15] ; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm6 -; AVX-NEXT: vmovdqa %xmm1, %xmm0 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15] +; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm12[8],xmm0[9],xmm12[9],xmm0[10],xmm12[10],xmm0[11],xmm12[11],xmm0[12],xmm12[12],xmm0[13],xmm12[13],xmm0[14],xmm12[14],xmm0[15],xmm12[15] ; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm5 ; AVX-NEXT: vmovdqa 32(%rdx), %xmm11 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm13[8],xmm11[9],xmm13[9],xmm11[10],xmm13[10],xmm11[11],xmm13[11],xmm11[12],xmm13[12],xmm11[13],xmm13[13],xmm11[14],xmm13[14],xmm11[15],xmm13[15] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 5e87572af5dc1..c86f0db930613 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -989,16 +989,14 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,1,2,3] ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,0,3] -; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4] +; SSE-NEXT: movdqa %xmm6, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255] ; SSE-NEXT: pand %xmm2, %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] -; SSE-NEXT: movdqa (%r8), %xmm0 ; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; SSE-NEXT: pand %xmm1, %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1] @@ -1012,10 +1010,11 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm11, %xmm10 ; SSE-NEXT: por %xmm5, %xmm10 +; SSE-NEXT: movdqa (%r8), %xmm5 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm0, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[1,1,2,2] ; SSE-NEXT: movdqa %xmm7, %xmm0 ; SSE-NEXT: pandn %xmm11, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 @@ -1679,8 +1678,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2] ; SSE-NEXT: movdqa %xmm3, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] ; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,0,3,4,5,6,7] @@ -1690,7 +1689,6 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: movdqa (%rcx), %xmm11 ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm10, %xmm0 @@ -1698,8 +1696,9 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm10, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7] +; SSE-NEXT: movdqa (%rcx), %xmm11 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,1,2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index fbeecbc0a4ab2..3ea3a45156b96 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -1768,10 +1768,10 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm2, %xmm7 ; SSE-NEXT: por %xmm6, %xmm7 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE-NEXT: pand %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm7 ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm6, %xmm1 ; SSE-NEXT: por %xmm7, %xmm1 @@ -2232,18 +2232,13 @@ define void @store_i8_stride6_vf32(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rax ; AVX2-NEXT: vmovdqa (%rcx), %ymm2 -; AVX2-NEXT: vmovdqa (%rdx), %ymm0 -; AVX2-NEXT: vmovdqa (%rsi), %ymm3 -; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%r8), %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] ; AVX2-NEXT: vmovdqa (%rcx), %xmm6 ; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm5 ; AVX2-NEXT: vmovdqa (%rdx), %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm9 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-NEXT: vmovdqa (%rsi), %xmm11 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] @@ -2260,12 +2255,16 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX2-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm12 +; AVX2-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-NEXT: vmovdqa (%rdx), %ymm0 +; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-NEXT: vmovdqa (%r8), %ymm4 ; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm7 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm7[0],ymm12[0],ymm7[1],ymm12[1],ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[4],ymm12[4],ymm7[5],ymm12[5],ymm7[6],ymm12[6],ymm7[7],ymm12[7],ymm7[16],ymm12[16],ymm7[17],ymm12[17],ymm7[18],ymm12[18],ymm7[19],ymm12[19],ymm7[20],ymm12[20],ymm7[21],ymm12[21],ymm7[22],ymm12[22],ymm7[23],ymm12[23] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] ; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm15 -; AVX2-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm7 +; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm7 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm7[0],ymm15[0],ymm7[1],ymm15[1],ymm7[2],ymm15[2],ymm7[3],ymm15[3],ymm7[4],ymm15[4],ymm7[5],ymm15[5],ymm7[6],ymm15[6],ymm7[7],ymm15[7],ymm7[16],ymm15[16],ymm7[17],ymm15[17],ymm7[18],ymm15[18],ymm7[19],ymm15[19],ymm7[20],ymm15[20],ymm7[21],ymm15[21],ymm7[22],ymm15[22],ymm7[23],ymm15[23] ; AVX2-NEXT: vmovdqa (%r9), %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3] @@ -2294,7 +2293,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535] ; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] +; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] ; AVX2-NEXT: vmovdqa %ymm3, %ymm12 ; 
AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15] ; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13] @@ -2366,19 +2365,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP: # %bb.0: ; AVX2-FP-NEXT: subq $40, %rsp ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FP-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm7 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm8 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FP-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] @@ -2502,19 +2501,19 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP: # %bb.0: ; AVX2-FCP-NEXT: subq $40, %rsp ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm4 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1 -; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 -; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm7 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1 -; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm8 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2 +; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm6 +; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm3 +; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm4 +; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] @@ -5453,22 +5452,22 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), 
%ymm11 -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512-NEXT: vmovdqa64 %xmm10, %xmm22 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512-NEXT: vmovdqa64 %xmm10, %xmm22 ; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm1 ; AVX512-NEXT: vmovdqa64 %xmm12, %xmm21 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512-NEXT: vpshufb %ymm15, %ymm4, %ymm0 +; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm0 ; AVX512-NEXT: vpshufb %ymm15, %ymm11, %ymm1 ; AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31] -; AVX512-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm2[8],ymm11[9],ymm2[9],ymm11[10],ymm2[10],ymm11[11],ymm2[11],ymm11[12],ymm2[12],ymm11[13],ymm2[13],ymm11[14],ymm2[14],ymm11[15],ymm2[15],ymm11[24],ymm2[24],ymm11[25],ymm2[25],ymm11[26],ymm2[26],ymm11[27],ymm2[27],ymm11[28],ymm2[28],ymm11[29],ymm2[29],ymm11[30],ymm2[30],ymm11[31],ymm2[31] +; AVX512-NEXT: vmovdqa64 %ymm2, %ymm19 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX512-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1 @@ -5982,22 +5981,22 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm4 -; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm22 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm0 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm22 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm1 ; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm21 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm15 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-NEXT: vpshufb %ymm15, %ymm4, %ymm0 +; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm0 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm4[8],ymm11[9],ymm4[9],ymm11[10],ymm4[10],ymm11[11],ymm4[11],ymm11[12],ymm4[12],ymm11[13],ymm4[13],ymm11[14],ymm4[14],ymm11[15],ymm4[15],ymm11[24],ymm4[24],ymm11[25],ymm4[25],ymm11[26],ymm4[26],ymm11[27],ymm4[27],ymm11[28],ymm4[28],ymm11[29],ymm4[29],ymm11[30],ymm4[30],ymm11[31],ymm4[31] -; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm19 +; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm11[8],ymm2[8],ymm11[9],ymm2[9],ymm11[10],ymm2[10],ymm11[11],ymm2[11],ymm11[12],ymm2[12],ymm11[13],ymm2[13],ymm11[14],ymm2[14],ymm11[15],ymm2[15],ymm11[24],ymm2[24],ymm11[25],ymm2[25],ymm11[26],ymm2[26],ymm11[27],ymm2[27],ymm11[28],ymm2[28],ymm11[29],ymm2[29],ymm11[30],ymm2[30],ymm11[31],ymm2[31] +; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm19 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] ; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 14e5f65407942..df7013c2378e7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -2684,29 +2684,29 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa 16(%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rdx), %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,6,6,6] -; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r8), %xmm6 ; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 16(%r9), %xmm5 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: movdqa 16(%rcx), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,5,5,7] -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa 16(%rcx), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 16(%r8), %xmm6 +; SSE-NEXT: movdqa 16(%r9), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm11 ; SSE-NEXT: pshufhw 
{{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4340,7 +4340,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] ; AVX512-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 -; AVX512-FCP-NEXT: vmovdqa64 (%r10), %ymm17 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero @@ -4352,6 +4351,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] ; AVX512-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm12 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] @@ -4369,7 +4369,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm8 -; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm12 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm12[13,u,u,u,u],zero,zero,ymm12[14,u,u,u,u],zero,zero,ymm12[15,u,u,u,u],zero,zero,ymm12[16,u,u,u,u],zero,zero,ymm12[17,u,u] ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 @@ -4619,7 +4618,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm0[0,1,0,1,4,5,4,5] ; AVX512DQ-FCP-NEXT: vpternlogd $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm16, %zmm7 -; AVX512DQ-FCP-NEXT: vmovdqa64 (%r10), %ymm17 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm15, %zmm7 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm4[14,u,u],zero,zero,zero,zero,ymm4[15,u,u],zero,zero,zero,zero,ymm4[16,u,u],zero,zero,zero,zero,ymm4[17,u,u],zero,zero,zero,zero,ymm4[18] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm3[0,1,14],zero,ymm3[u,u,0,1,14,15],zero,ymm3[u,u,13,2,3,16],zero,ymm3[u,u,28,29,16,17],zero,ymm3[u,u,19,28,29,18],zero @@ -4631,6 +4629,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = 
ymm5[u,u,u,u,14],zero,ymm5[u,u,u,u,u,15],zero,ymm5[u,u,u,u,u,16],zero,ymm5[u,u,u,u,u,17],zero,ymm5[u,u,u,u,u] ; AVX512DQ-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm12 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] @@ -4648,7 +4647,6 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm12 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,ymm12[13,u,u,u,u],zero,zero,ymm12[14,u,u,u,u],zero,zero,ymm12[15,u,u,u,u],zero,zero,ymm12[16,u,u,u,u],zero,zero,ymm12[17,u,u] ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm8 @@ -5329,10 +5327,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[3,3,3,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm11 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,0,3] ; SSE-NEXT: movdqa %xmm6, %xmm2 @@ -5928,11 +5926,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,7,5,6,4] +; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm6, %xmm2 ; SSE-NEXT: pandn %xmm10, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm8[1,1,2,2,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,2,1] -; SSE-NEXT: pand %xmm6, %xmm10 +; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: por %xmm10, %xmm2 ; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm15, %xmm2 @@ -5941,9 +5940,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm0, %xmm15 ; SSE-NEXT: pandn %xmm10, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] -; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 39b012bcf8d4e..a75b5f22d1605 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -3041,27 +3041,27 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm1, %ymm1 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5,5,7] -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7] ; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13 -; AVX2-NEXT: vmovdqa 16(%rdx), %xmm15 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5],ymm13[6],ymm1[7],ymm13[8,9,10,11,12],ymm1[13],ymm13[14],ymm1[15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX2-NEXT: vmovdqa 16(%rcx), %xmm8 ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5] ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,6,6,7] ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm5 +; AVX2-NEXT: vmovdqa 16(%rcx), %xmm8 +; AVX2-NEXT: vmovdqa 16(%rdx), %xmm15 +; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15] ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] @@ -3079,8 +3079,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-NEXT: vmovdqa %xmm9, %xmm5 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,3,2,3] ; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3] diff --git a/llvm/test/CodeGen/X86/vector-intrinsics.ll 
b/llvm/test/CodeGen/X86/vector-intrinsics.ll index 6441a83a4e326..aa01a42b86761 100644 --- a/llvm/test/CodeGen/X86/vector-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-intrinsics.ll @@ -239,15 +239,15 @@ define void @b(ptr %p, ptr %q) nounwind { ; CHECK-NEXT: movaps 32(%rdi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: movaps (%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 16(%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps 48(%rdi), %xmm2 ; CHECK-NEXT: movaps 32(%rsi), %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps 48(%rdi), %xmm2 +; CHECK-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps 48(%rsi), %xmm1 ; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index d6171235aa2c4..d8c9102ecc659 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -686,12 +686,11 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm5[2,3] ; X86-AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; X86-AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; X86-AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; X86-AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] ; X86-AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; X86-AVX1-NEXT: vmovaps %ymm4, %ymm1 +; X86-AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm3[0],ymm6[2],ymm3[2] +; X86-AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm3[1],ymm6[3],ymm3[3] ; X86-AVX1-NEXT: movl %ebp, %esp ; X86-AVX1-NEXT: popl %ebp ; X86-AVX1-NEXT: retl @@ -717,9 +716,8 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X86-AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; X86-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; X86-AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; X86-AVX2-NEXT: vmovaps %ymm5, %ymm0 +; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7] ; X86-AVX2-NEXT: vmovaps %ymm4, %ymm1 ; X86-AVX2-NEXT: movl %ebp, %esp ; X86-AVX2-NEXT: popl %ebp @@ -740,13 +738,11 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3] ; X64-AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; X64-AVX1-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; X64-AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; X64-AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] ; X64-AVX1-NEXT: vunpckhpd 
{{.*#+}} ymm2 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; X64-AVX1-NEXT: vmovaps %ymm4, %ymm1 -; X64-AVX1-NEXT: vmovaps %ymm5, %ymm3 +; X64-AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] +; X64-AVX1-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: bit_reversal_permutation: @@ -754,20 +750,19 @@ define <16 x i64> @bit_reversal_permutation(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] ; X64-AVX2-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,2,3] -; X64-AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] -; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5],ymm5[6,7] ; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3] +; X64-AVX2-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,3,3] ; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm7 = xmm2[0],xmm3[0] ; X64-AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] +; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; X64-AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] -; X64-AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] -; X64-AVX2-NEXT: vmovaps %ymm6, %ymm0 +; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] ; X64-AVX2-NEXT: vmovaps %ymm4, %ymm1 ; X64-AVX2-NEXT: retq %v0 = shufflevector <16 x i64> %a0, <16 x i64> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index 90b6beeae516d..f4c2eb5f213d9 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -130,9 +130,8 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_32i8_to_32i16: @@ -587,9 +586,8 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_16i16_to_16i32: @@ -884,9 +882,8 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: 
vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: zext_8i32_to_8i64: diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 35b90a4b2205f..f84131dfc8797 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -3220,8 +3220,8 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index bf330de825966..7dc6956120d3a 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1277,34 +1277,33 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-NEXT: vmovdqu (%rdi), %xmm11 ; AVX1-NEXT: vmovups 64(%rdi), %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13 ; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5 -; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 -; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10 ; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2 -; AVX1-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10 ; AVX1-NEXT: vmovdqu 160(%rdi), %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14] -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 ; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6 ; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7 -; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8 -; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u] ; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12 -; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpor %xmm5, %xmm12, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufb %xmm14, %xmm10, %xmm10 ; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12 ; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13 +; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4 +; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8 +; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9 +; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm11 -; AVX1-NEXT: vmovdqa %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpshufb 
%xmm15, %xmm1, %xmm12 +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpshufb %xmm15, %xmm0, %xmm12 ; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm11 diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll index 150385ffd8aa8..a2bc5c6eeeb79 100644 --- a/llvm/test/CodeGen/X86/xmulo.ll +++ b/llvm/test/CodeGen/X86/xmulo.ll @@ -210,14 +210,14 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: movl %edx, %ebx ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %ebp ; WIN32-NEXT: addl %eax, %ecx ; WIN32-NEXT: addl %esi, %ecx ; WIN32-NEXT: movl %edi, %eax @@ -973,14 +973,14 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %edi, %esi ; WIN32-NEXT: movl %edx, %ebp ; WIN32-NEXT: sarl $31, %ecx -; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: mull %ecx +; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi +; WIN32-NEXT: movl %edi, %esi +; WIN32-NEXT: imull %ecx, %esi ; WIN32-NEXT: movl %edx, %ecx -; WIN32-NEXT: movl %eax, %ebx ; WIN32-NEXT: addl %eax, %ecx ; WIN32-NEXT: addl %esi, %ecx ; WIN32-NEXT: movl %edi, %eax From 45e229df9fb26242e2ae0236f1c446ba2ea1bbfe Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Thu, 22 Aug 2024 21:44:40 +0200 Subject: [PATCH 05/15] Fix safety tests --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 750eaf2de4857..3c59d3fcadaec 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -125,12 +125,12 @@ static std::optional> moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { MachineInstr *DstInstr = Dst->getInstr(); MachineInstr *SrcInstr = Src->getInstr(); - MachineBasicBlock *MBB = SrcInstr->getParent(); - if (DstInstr == nullptr || SrcInstr == nullptr) return {}; + + MachineBasicBlock *MBB = SrcInstr->getParent(); assert("This function only operates on a basic block level." && - MBB == SrcInstr->getParent()); + MBB == DstInstr->getParent()); int SectionSize = std::distance(SrcInstr->getIterator(), DstInstr->getIterator()); From 793dd6b63534fa7dd0519d255dcc4b0fc8e92627 Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Thu, 22 Aug 2024 22:07:02 +0200 Subject: [PATCH 06/15] Make sure that copy src precedes copy dst --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 3c59d3fcadaec..8143398d550b5 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -132,7 +132,9 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { assert("This function only operates on a basic block level." 
&& MBB == DstInstr->getParent()); - int SectionSize = + assert(std::distance(SrcInstr->getIterator(), DstInstr->getIterator()) > 0 && + "The copy source must precede the copy destination."); + unsigned SectionSize = std::distance(SrcInstr->getIterator(), DstInstr->getIterator()); From 5420aa6005d7401188fa51e591dc20f6c138a2c3 Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Thu, 22 Aug 2024 22:21:08 +0200 Subject: [PATCH 07/15] Keep using int for the section size. It is compared with possibly negative values and is also needed for the assertion --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 8143398d550b5..ccf7e9d39003b 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -132,11 +132,13 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { assert("This function only operates on a basic block level." && MBB == DstInstr->getParent()); - assert(std::distance(SrcInstr->getIterator(), DstInstr->getIterator()) > 0 && - "The copy source must precede the copy destination."); - unsigned SectionSize = + + int SectionSize = std::distance(SrcInstr->getIterator(), DstInstr->getIterator()); + assert(SectionSize > 0 && + "The copy source must precede the copy destination."); + // The bit vector representing the instructions in the section. // This vector stores which instruction needs to be moved and which does not. BitVector SectionInstr(SectionSize, false); From ca2e3a93a77f6fc572420b6ee3ea561a94b7dc87 Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Thu, 22 Aug 2024 22:28:59 +0200 Subject: [PATCH 08/15] Capture the Queue instead of passing it as an argument --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index ccf7e9d39003b..b53e4b12467c7 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -152,8 +152,7 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { // (only if we are not talking about the destination node which is a special // case indicated by a flag) and is located between the source of the copy and // the destination of the copy. - auto ProcessSNodeChildren = [SrcInstr, &SectionSize, &SectionInstr]( - std::queue<SUnit *> &Queue, + auto ProcessSNodeChildren = [&Edges, SrcInstr, &SectionSize, &SectionInstr]( const SUnit *Node, bool IsRoot) -> bool { for (llvm::SDep I : Node->Preds) { SUnit *SU = I.getSUnit(); @@ -171,7 +170,7 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { // dependence. We do not need to do anything with it again. if (!SectionInstr[DestinationFromSource]) { SectionInstr[DestinationFromSource] = true; - Queue.push(SU); + Edges.push(SU); } } } @@ -197,11 +196,11 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { // source. To decide if we have it as dependency of another instruction, we // must check in the already traversed list if any of the instructions that is // depended on the source is contained. This would introduce extra costs.
- ProcessSNodeChildren(Edges, Dst, true); + ProcessSNodeChildren(Dst, true); while (!Edges.empty()) { const auto *Current = Edges.front(); Edges.pop(); - if (!ProcessSNodeChildren(Edges, Current, false)) + if (!ProcessSNodeChildren(Current, false)) return {}; } From b9908c44b74d7e2f077b4fbf51c75b627706b416 Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Thu, 22 Aug 2024 23:18:50 +0200 Subject: [PATCH 09/15] One construction for the ScheduleDAG --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 47 ++++++++++----------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index b53e4b12467c7..95160c31320e4 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -66,6 +66,7 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -586,11 +587,11 @@ class MachineCopyPropagation : public MachineFunctionPass { void ReadRegister(MCRegister Reg, MachineInstr &Reader, DebugType DT); void readSuccessorLiveIns(const MachineBasicBlock &MBB); void ForwardCopyPropagateBlock(MachineBasicBlock &MBB); - void BackwardCopyPropagateBlock(MachineBasicBlock &MBB, bool ResolveAntiDeps = false); + void BackwardCopyPropagateBlock(MachineBasicBlock &MBB, ScheduleDAGMCP *DG = nullptr); void EliminateSpillageCopies(MachineBasicBlock &MBB); bool eraseIfRedundant(MachineInstr &Copy, MCRegister Src, MCRegister Def); void forwardUses(MachineInstr &MI); - void propagateDefs(MachineInstr &MI, ScheduleDAGMCP &DG, bool ResolveAntiDeps = false); + void propagateDefs(MachineInstr &MI, ScheduleDAGMCP *DG = nullptr); bool isForwardableRegClassCopy(const MachineInstr &Copy, const MachineInstr &UseI, unsigned UseIdx); bool isBackwardPropagatableRegClassCopy(const MachineInstr &Copy, @@ -1158,9 +1159,8 @@ static bool isBackwardPropagatableCopy(const DestSourcePair &CopyOperands, return CopyOperands.Source->isRenamable() && CopyOperands.Source->isKill(); } -void MachineCopyPropagation::propagateDefs( - MachineInstr &MI, ScheduleDAGMCP &DG, - bool MoveDependenciesForBetterCopyPropagation) { +void MachineCopyPropagation::propagateDefs(MachineInstr &MI, + ScheduleDAGMCP *DG) { if (!Tracker.hasAnyCopies() && !Tracker.hasAnyInvalidCopies()) return; @@ -1186,7 +1186,7 @@ void MachineCopyPropagation::propagateDefs( MachineInstr *Copy = Tracker.findAvailBackwardCopy( MI, MODef.getReg().asMCReg(), *TRI, *TII, UseCopyInstr); if (!Copy) { - if (!MoveDependenciesForBetterCopyPropagation) + if (!DG) continue; LLVM_DEBUG( @@ -1203,8 +1203,8 @@ void MachineCopyPropagation::propagateDefs( LLVM_DEBUG( dbgs() << "MCP: Found potential backward copy that has dependency.\n"); - SUnit *DstSUnit = DG.getSUnit(Copy); - SUnit *SrcSUnit = DG.getSUnit(&MI); + SUnit *DstSUnit = DG->getSUnit(Copy); + SUnit *SrcSUnit = DG->getSUnit(&MI); InstructionsToMove = moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit); @@ -1232,7 +1232,7 @@ void MachineCopyPropagation::propagateDefs( LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI) << "\n with " << printReg(Def, TRI) << "\n in " << MI << " from " << *Copy); - if (!MoveDependenciesForBetterCopyPropagation) { + if (!DG) { MODef.setReg(Def);
MODef.setIsRenamable(CopyOperands->Destination->isRenamable()); @@ -1249,12 +1249,11 @@ } void MachineCopyPropagation::BackwardCopyPropagateBlock( - MachineBasicBlock &MBB, bool MoveDependenciesForBetterCopyPropagation) { - ScheduleDAGMCP DG{*(MBB.getParent()), nullptr, false}; - if (MoveDependenciesForBetterCopyPropagation) { - DG.startBlock(&MBB); - DG.enterRegion(&MBB, MBB.begin(), MBB.end(), MBB.size()); - DG.buildSchedGraph(nullptr); + MachineBasicBlock &MBB, ScheduleDAGMCP *DG) { + if (DG) { + DG->startBlock(&MBB); + DG->enterRegion(&MBB, MBB.begin(), MBB.end(), MBB.size()); + DG->buildSchedGraph(nullptr); // DG.viewGraph(); } @@ -1277,7 +1276,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( // just let forward cp do COPY-to-COPY propagation. if (isBackwardPropagatableCopy(*CopyOperands, *MRI)) { Tracker.invalidateRegister(SrcReg.asMCReg(), *TRI, *TII, UseCopyInstr, - MoveDependenciesForBetterCopyPropagation); + DG); Tracker.invalidateRegister(DefReg.asMCReg(), *TRI, *TII, UseCopyInstr); Tracker.trackCopy(&MI, *TRI, *TII, UseCopyInstr); @@ -1295,7 +1294,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( Tracker.invalidateRegister(Reg, *TRI, *TII, UseCopyInstr, false); } - propagateDefs(MI, DG, MoveDependenciesForBetterCopyPropagation); + propagateDefs(MI, DG); for (const MachineOperand &MO : MI.operands()) { if (!MO.isReg()) continue; @@ -1320,7 +1319,7 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( } else { Tracker.invalidateRegister(MO.getReg().asMCReg(), *TRI, *TII, UseCopyInstr, - MoveDependenciesForBetterCopyPropagation); + DG); } } } @@ -1338,13 +1337,13 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( Copy->eraseFromParent(); ++NumDeletes; } - if (MoveDependenciesForBetterCopyPropagation) { - DG.exitRegion(); - DG.finishBlock(); + if (DG) { + DG->exitRegion(); + DG->finishBlock(); // QUESTION: Does it make sense to keep the kill flags here? // On the other parts of this pass we just throw out // the kill flags. - DG.fixupKills(MBB); + DG->fixupKills(MBB); } @@ -1699,7 +1698,7 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>(); LIS = LISWrapper ? &LISWrapper->getLIS() : nullptr; - + ScheduleDAGMCP DG{MF, nullptr, false}; for (MachineBasicBlock &MBB : MF) { if (isSpillageCopyElimEnabled) EliminateSpillageCopies(MBB); @@ -1716,7 +1715,7 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { // The renaming wouldn't happen instantly. There would be a data structure // that contained what register should be renamed to what. Then after the // backward propagation has concluded the renaming would happen. - BackwardCopyPropagateBlock(MBB, true); + BackwardCopyPropagateBlock(MBB, &DG); // Then we do the actual copy propagation.
BackwardCopyPropagateBlock(MBB); From 5561de96db79a30fbe14bc58c46e45957c790124 Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Tue, 27 Aug 2024 12:56:45 +0200 Subject: [PATCH 10/15] Use a limit for the search --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 95160c31320e4..2780e569426f9 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -124,6 +124,7 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs { static std::optional<SmallVector<MachineInstr *>> moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { + unsigned MaxNumberOfNodesToBeProcessed = 25; MachineInstr *DstInstr = Dst->getInstr(); MachineInstr *SrcInstr = Src->getInstr(); if (DstInstr == nullptr || SrcInstr == nullptr) @@ -147,13 +148,15 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { // The queue for the breadth first search. std::queue<SUnit *> Edges; + unsigned NumProcessedNode = 0; + // Process the children of a node. // Basically every node is checked before being put into the queue. // A node is enqueued if it has no dependencies on the source of the copy // (only if we are not talking about the destination node which is a special // case indicated by a flag) and is located between the source of the copy and // the destination of the copy. - auto ProcessSNodeChildren = [&Edges, SrcInstr, &SectionSize, &SectionInstr]( + auto ProcessSNodeChildren = [&Edges, SrcInstr, &SectionSize, &SectionInstr, &NumProcessedNode, &MaxNumberOfNodesToBeProcessed]( const SUnit *Node, bool IsRoot) -> bool { for (llvm::SDep I : Node->Preds) { SUnit *SU = I.getSUnit(); @@ -174,8 +177,9 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { Edges.push(SU); } } + NumProcessedNode++; } - return true; + return NumProcessedNode < MaxNumberOfNodesToBeProcessed; }; // The BFS happens here.
From a5a2766a85703c60cd2c062581f6d9b65b4883f2 Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Tue, 27 Aug 2024 20:42:17 +0200 Subject: [PATCH 11/15] Decrease the node limit to 10 --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 2780e569426f9..62452b9211e30 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -124,7 +124,7 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs { static std::optional<SmallVector<MachineInstr *>> moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { - unsigned MaxNumberOfNodesToBeProcessed = 25; + unsigned MaxNumberOfNodesToBeProcessed = 10; MachineInstr *DstInstr = Dst->getInstr(); MachineInstr *SrcInstr = Src->getInstr(); if (DstInstr == nullptr || SrcInstr == nullptr) @@ -175,9 +175,9 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { if (!SectionInstr[DestinationFromSource]) { SectionInstr[DestinationFromSource] = true; Edges.push(SU); + NumProcessedNode++; } } - NumProcessedNode++; } return NumProcessedNode < MaxNumberOfNodesToBeProcessed; }; From 9d87620f210d832398a9cd8f039689d1007486b9 Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Thu, 29 Aug 2024 18:59:39 +0200 Subject: [PATCH 12/15] Just build the graph for copy regions --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 31 +++++++++++---------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 62452b9211e30..fbe5e35aa4874 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -123,20 +123,24 @@ class ScheduleDAGMCP : public ScheduleDAGInstrs { }; static std::optional<SmallVector<MachineInstr *>> -moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { +moveInstructionsOutOfTheWayIfWeCan(MachineInstr *DstInstr, MachineInstr *SrcInstr, ScheduleDAGMCP &DG) { + SUnit *Dst; + //SUnit *Src; + + MachineBasicBlock *MBB = SrcInstr->getParent(); + int SectionSize = + std::distance(SrcInstr->getIterator(), DstInstr->getIterator()); + + DG.enterRegion(MBB, (SrcInstr->getIterator()), ++(DstInstr->getIterator()), SectionSize+1); + DG.buildSchedGraph(nullptr); + Dst = DG.getSUnit(DstInstr); unsigned MaxNumberOfNodesToBeProcessed = 10; - MachineInstr *DstInstr = Dst->getInstr(); - MachineInstr *SrcInstr = Src->getInstr(); if (DstInstr == nullptr || SrcInstr == nullptr) return {}; - MachineBasicBlock *MBB = SrcInstr->getParent(); assert("This function only operates on a basic block level." && MBB == DstInstr->getParent()); - - int SectionSize = - std::distance(SrcInstr->getIterator(), DstInstr->getIterator()); assert(SectionSize > 0 && "The copy source must precede the copy destination."); @@ -205,10 +209,14 @@ moveInstructionsOutOfTheWayIfWeCan(SUnit *Dst, SUnit *Src) { while (!Edges.empty()) { const auto *Current = Edges.front(); Edges.pop(); - if (!ProcessSNodeChildren(Current, false)) + if (!ProcessSNodeChildren(Current, false)) { + DG.exitRegion(); return {}; + } } + DG.exitRegion(); + // If all of the dependencies were deemed valid during the BFS then we // are moving them before the copy source here keeping their relative // order to each other.
@@ -1207,11 +1215,9 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI, LLVM_DEBUG( dbgs() << "MCP: Found potential backward copy that has dependency.\n"); - SUnit *DstSUnit = DG->getSUnit(Copy); - SUnit *SrcSUnit = DG->getSUnit(&MI); InstructionsToMove = - moveInstructionsOutOfTheWayIfWeCan(DstSUnit, SrcSUnit); + moveInstructionsOutOfTheWayIfWeCan(Copy, &MI, *DG); if (!InstructionsToMove) continue; } @@ -1256,8 +1262,6 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( MachineBasicBlock &MBB, ScheduleDAGMCP *DG) { if (DG) { DG->startBlock(&MBB); - DG->enterRegion(&MBB, MBB.begin(), MBB.end(), MBB.size()); - DG->buildSchedGraph(nullptr); // DG.viewGraph(); } @@ -1342,7 +1346,6 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( ++NumDeletes; } if (DG) { - DG->exitRegion(); DG->finishBlock(); // QUESTION: Does it make sense to keep the kill flags here? // On the other parts of this pass we just throw out From a34e68d6390cf23a88315e3b3527748f7724639c Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Fri, 23 Aug 2024 12:48:32 +0200 Subject: [PATCH 13/15] Fix a typo --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index fbe5e35aa4874..c1fe3f196469f 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -201,7 +201,7 @@ moveInstructionsOutOfTheWayIfWeCan(MachineInstr *DstInstr, MachineInstr *SrcInst // this scenario this must be ignored. Let's say that we can not control what // nodes to process and we come across the copy source. How do I know what // node has that copy source as their dependency? We can check of which node - // is the copy source the dependency of. This list will alway contain the + // is the copy source the dependency of. This list will always contain the // source. To decide if we have it as dependency of another instruction, we // must check in the already traversed list if any of the instructions that is // depended on the source is contained. This would introduce extra costs. From f6ca9f674ac27cc31a67a47d2bcf13e4880a67b7 Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Fri, 30 Aug 2024 13:53:04 +0200 Subject: [PATCH 14/15] Port back to one stage --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 62 +++++++------------ .../CodeGen/AArch64/anti-dependencies-mcp.mir | 2 +- 2 files changed, 23 insertions(+), 41 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index c1fe3f196469f..7618bfb83de92 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -945,7 +945,6 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) { MOUse.setIsUndef(CopySrc.isUndef()); LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); - // Clear kill markers that may have been invalidated.
for (MachineInstr &KMI : make_range(Copy->getIterator(), std::next(MI.getIterator()))) @@ -1237,32 +1237,31 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI, if (hasOverlappingMultipleDef(MI, MODef, Def)) continue; - + if (InstructionsToMove) + for (auto *I : *InstructionsToMove) { + MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator()); + } LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI) << "\n with " << printReg(Def, TRI) << "\n in " << MI << " from " << *Copy); - if (!DG) { - MODef.setReg(Def); - MODef.setIsRenamable(CopyOperands->Destination->isRenamable()); - LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); - MaybeDeadCopies.insert(Copy); - Changed = true; - ++NumCopyBackwardPropagated; - } else if (InstructionsToMove) { - for (auto *I : *InstructionsToMove) { - MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator()); - } - } + + MODef.setReg(Def); + MODef.setIsRenamable(CopyOperands->Destination->isRenamable()); + + LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); + MaybeDeadCopies.insert(Copy); + Changed = true; + ++NumCopyBackwardPropagated; + + } } void MachineCopyPropagation::BackwardCopyPropagateBlock( MachineBasicBlock &MBB, ScheduleDAGMCP *DG) { - if (DG) { - DG->startBlock(&MBB); - // DG.viewGraph(); - } + DG->startBlock(&MBB); + // DG.viewGraph(); LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName() @@ -1345,13 +1343,12 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( Copy->eraseFromParent(); ++NumDeletes; } - if (DG) { - DG->finishBlock(); - // QUESTION: Does it make sense to keep the kill flags here? - // On the other parts of this pass we just throw out - // the kill flags. - DG->fixupKills(MBB); - } + + DG->finishBlock(); + // QUESTION: Does it make sense to keep the kill flags here? + // On the other parts of this pass we just throw out + // the kill flags. + DG->fixupKills(MBB); MaybeDeadCopies.clear(); @@ -1709,22 +1706,7 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { if (isSpillageCopyElimEnabled) EliminateSpillageCopies(MBB); - - // BackwardCopyPropagateBlock happens in two stages. - // First we move those unnecessary dependencies out of the way - // that may block copy propagations. - // - // The reason for this two stage approach is that the ScheduleDAG can not - // handle register renaming. - // QUESTION: I think these two stages could be merged together, if I were to change - // the renaming mechanism. - // - // The renaming wouldn't happen instantly. There would be a data structure - // that contained what register should be renamed to what. Then after the - // backward propagation has concluded the renaming would happen. BackwardCopyPropagateBlock(MBB, &DG); - // Then we do the actual copy propagation.
- BackwardCopyPropagateBlock(MBB); ForwardCopyPropagateBlock(MBB); } diff --git a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir index c3a59990ccd25..abf668533fd01 100644 --- a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir +++ b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir @@ -177,9 +177,9 @@ body: | ; CHECK-NEXT: $w1 = KILL killed renamable $w1, implicit $x1 ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def $w0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp - ; CHECK-NEXT: renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c) ; CHECK-NEXT: renamable $w1 = COPY $w0 ; CHECK-NEXT: $w0 = ADDWrr killed $w0, $w0 + ; CHECK-NEXT: $w0 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c) ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def dead $w0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp From 027d7761dbb0293451a2c00e32cd6dc5ce83252c Mon Sep 17 00:00:00 2001 From: Gabor Spaits Date: Fri, 30 Aug 2024 17:28:39 +0200 Subject: [PATCH 15/15] Revert "Port back to one stage" This reverts commit f6ca9f674ac27cc31a67a47d2bcf13e4880a67b7. --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 62 ++++++++++++------- .../CodeGen/AArch64/anti-dependencies-mcp.mir | 2 +- 2 files changed, 41 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 7618bfb83de92..c1fe3f196469f 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -945,6 +945,7 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) { MOUse.setIsUndef(CopySrc.isUndef()); LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); + // Clear kill markers that may have been invalidated.
for (MachineInstr &KMI : make_range(Copy->getIterator(), std::next(MI.getIterator()))) @@ -1237,31 +1238,32 @@ void MachineCopyPropagation::propagateDefs(MachineInstr &MI, if (hasOverlappingMultipleDef(MI, MODef, Def)) continue; - if (InstructionsToMove) - for (auto *I : *InstructionsToMove) { - MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator()); - } + LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI) << "\n with " << printReg(Def, TRI) << "\n in " << MI << " from " << *Copy); + if (!DG) { + MODef.setReg(Def); + MODef.setIsRenamable(CopyOperands->Destination->isRenamable()); - - MODef.setReg(Def); - MODef.setIsRenamable(CopyOperands->Destination->isRenamable()); - - LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); - MaybeDeadCopies.insert(Copy); - Changed = true; - ++NumCopyBackwardPropagated; - - + LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); + MaybeDeadCopies.insert(Copy); + Changed = true; + ++NumCopyBackwardPropagated; + } else if (InstructionsToMove) { + for (auto *I : *InstructionsToMove) { + MI.getParent()->splice(MI.getIterator(), MI.getParent(), I->getIterator()); + } + } } } void MachineCopyPropagation::BackwardCopyPropagateBlock( MachineBasicBlock &MBB, ScheduleDAGMCP *DG) { - DG->startBlock(&MBB); - // DG.viewGraph(); + if (DG) { + DG->startBlock(&MBB); + // DG.viewGraph(); + } LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName() @@ -1343,12 +1345,13 @@ void MachineCopyPropagation::BackwardCopyPropagateBlock( Copy->eraseFromParent(); ++NumDeletes; } - - DG->finishBlock(); - // QUESTION: Does it make sense to keep the kill flags here? - // On the other parts of this pass we just throw out - // the kill flags. - DG->fixupKills(MBB); + if (DG) { + DG->finishBlock(); + // QUESTION: Does it make sense to keep the kill flags here? + // On the other parts of this pass we just throw out + // the kill flags. + DG->fixupKills(MBB); + } MaybeDeadCopies.clear(); @@ -1706,7 +1709,22 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { if (isSpillageCopyElimEnabled) EliminateSpillageCopies(MBB); + + // BackwardCopyPropagateBlock happens in two stages. + // First we move those unnecessary dependencies out of the way + // that may block copy propagations. + // + // The reason for this two stage approach is that the ScheduleDAG can not + // handle register renaming. + // QUESTION: I think these two stages could be merged together, if I were to change + // the renaming mechanism. + // + // The renaming wouldn't happen instantly. There would be a data structure + // that contained what register should be renamed to what. Then after the + // backward propagation has concluded the renaming would happen. BackwardCopyPropagateBlock(MBB, &DG); + // Then we do the actual copy propagation.
+ BackwardCopyPropagateBlock(MBB); ForwardCopyPropagateBlock(MBB); } diff --git a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir index abf668533fd01..c3a59990ccd25 100644 --- a/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir +++ b/llvm/test/CodeGen/AArch64/anti-dependencies-mcp.mir @@ -177,9 +177,9 @@ body: | ; CHECK-NEXT: $w1 = KILL killed renamable $w1, implicit $x1 ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def $w0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp + ; CHECK-NEXT: renamable $w8 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c) ; CHECK-NEXT: renamable $w1 = COPY $w0 ; CHECK-NEXT: $w0 = ADDWrr killed $w0, $w0 - ; CHECK-NEXT: $w0 = LDRWui %stack.0.c, 0 :: (dereferenceable load (s32) from %ir.c) ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-NEXT: BL @chain, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit killed $w0, implicit killed $w1, implicit-def $sp, implicit-def dead $w0 ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
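
The dependency walk these patches build up can be summarized in a small, self-contained C++ sketch. It is illustrative only: SUnit/SDep are replaced by plain integer indices into the section (index 0 is the copy source, the last index is the copy destination, and Preds[i] lists the in-section instructions that instruction i depends on), and the function name collectInstructionsToMove, the Preds encoding, and the MaxNodes parameter are invented for the sketch; only the default budget of 10 mirrors the limit settled on in PATCH 11.

#include <optional>
#include <queue>
#include <vector>

// Toy model of the dependency walk: instruction i depends on the
// instructions listed in Preds[i]; index 0 is the copy source and the
// last index is the copy destination. Returns the instructions that
// would have to be hoisted above the copy source, or std::nullopt if
// the move is unsafe or the node budget runs out.
std::optional<std::vector<int>>
collectInstructionsToMove(const std::vector<std::vector<int>> &Preds,
                          unsigned MaxNodes = 10) {
  if (Preds.size() < 2)
    return std::nullopt;
  const int Source = 0;
  const int Dest = static_cast<int>(Preds.size()) - 1;
  std::vector<bool> MustMove(Preds.size(), false);
  std::queue<int> Worklist;
  unsigned NumProcessed = 0;

  // Enqueue the in-section predecessors of Node. Only the BFS root (the
  // copy destination) may depend on the source; that anti-dependence is
  // the one being broken. Any other path back to the source means the
  // section cannot be rearranged safely.
  auto Process = [&](int Node, bool IsRoot) -> bool {
    for (int P : Preds[Node]) {
      if (P == Source && !IsRoot)
        return false;
      if (P > Source && P < Dest && !MustMove[P]) {
        MustMove[P] = true;
        Worklist.push(P);
        ++NumProcessed; // budget charged per enqueued node, as in PATCH 11
      }
    }
    return NumProcessed < MaxNodes;
  };

  if (!Process(Dest, /*IsRoot=*/true))
    return std::nullopt;
  while (!Worklist.empty()) {
    int Cur = Worklist.front();
    Worklist.pop();
    if (!Process(Cur, /*IsRoot=*/false))
      return std::nullopt;
  }

  // Collect the marked instructions in program order; the pass splices
  // them above the copy source while preserving this relative order.
  std::vector<int> Result;
  for (int I = Source + 1; I < Dest; ++I)
    if (MustMove[I])
      Result.push_back(I);
  return Result;
}

For a three-instruction section with Preds = {{}, {}, {1}}, the result is {1}: the middle instruction is what the destination depends on, so hoisting it above the copy source unblocks the propagation. With Preds = {{}, {0}, {1}} the middle instruction depends on the copy source itself, so the sketch returns std::nullopt and the copy is left alone, matching the bail-out paths in the pass.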