diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index ba46ededc63a8..87e057a468afd 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2931,16 +2931,6 @@ struct RegPairInfo { } // end anonymous namespace -unsigned findFreePredicateReg(BitVector &SavedRegs) { - for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) { - if (SavedRegs.test(PReg)) { - unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0; - return PNReg; - } - } - return AArch64::NoRegister; -} - static void computeCalleeSaveRegisterPairs( MachineFunction &MF, ArrayRef CSI, const TargetRegisterInfo *TRI, SmallVectorImpl &RegPairs, @@ -3645,7 +3635,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, unsigned ExtraCSSpill = 0; bool HasUnpairedGPR64 = false; - bool HasPairZReg = false; // Figure out which callee-saved registers to save/restore. for (unsigned i = 0; CSRegs[i]; ++i) { const unsigned Reg = CSRegs[i]; @@ -3699,28 +3688,6 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, !RegInfo->isReservedReg(MF, PairedReg)) ExtraCSSpill = PairedReg; } - // Check if there is a pair of ZRegs, so it can select PReg for spill/fill - HasPairZReg |= (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) && - SavedRegs.test(CSRegs[i ^ 1])); - } - - if (HasPairZReg && (Subtarget.hasSVE2p1() || Subtarget.hasSME2())) { - AArch64FunctionInfo *AFI = MF.getInfo(); - // Find a suitable predicate register for the multi-vector spill/fill - // instructions. - unsigned PnReg = findFreePredicateReg(SavedRegs); - if (PnReg != AArch64::NoRegister) - AFI->setPredicateRegForFillSpill(PnReg); - // If no free callee-save has been found assign one. - if (!AFI->getPredicateRegForFillSpill() && - MF.getFunction().getCallingConv() == - CallingConv::AArch64_SVE_VectorCall) { - SavedRegs.set(AArch64::P8); - AFI->setPredicateRegForFillSpill(AArch64::PN8); - } - - assert(!RegInfo->isReservedReg(MF, AFI->getPredicateRegForFillSpill()) && - "Predicate cannot be a reserved register"); } if (MF.getFunction().getCallingConv() == CallingConv::Win64 && diff --git a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll index 6264ce0cf4ae6..fa8f92cb0a2c9 100644 --- a/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll +++ b/llvm/test/CodeGen/AArch64/sme-vg-to-stack.ll @@ -329,27 +329,34 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: .cfi_offset w29, -32 ; CHECK-NEXT: addvl sp, sp, #-18 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG -; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue pn8.b ; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill ; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 32 - 8 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 32 - 16 * VG ; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x60, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 32 - 24 * VG @@ -371,16 +378,23 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; CHECK-NEXT: .cfi_restore vg ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 144 * VG -; CHECK-NEXT: ptrue pn8.b +; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -424,27 +438,34 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: .cfi_offset w30, -40 ; FP-CHECK-NEXT: .cfi_offset w29, -48 ; FP-CHECK-NEXT: addvl sp, sp, #-18 -; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: ptrue pn8.b ; FP-CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; FP-CHECK-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill ; FP-CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; FP-CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; FP-CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; FP-CHECK-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; FP-CHECK-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; FP-CHECK-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 48 - 8 * VG ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 48 - 16 * VG ; FP-CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x50, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 48 - 24 * VG @@ -464,16 +485,23 @@ define void @vg_unwind_with_sve_args( %x) #0 { ; FP-CHECK-NEXT: smstart sm ; FP-CHECK-NEXT: .cfi_restore vg ; FP-CHECK-NEXT: addvl sp, sp, #1 -; FP-CHECK-NEXT: ptrue pn8.b +; FP-CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; FP-CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload ; FP-CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; FP-CHECK-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; FP-CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; FP-CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; FP-CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll index 29d3d68fc4c3d..013d8a0512b15 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll @@ -55,31 +55,45 @@ define @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i8_z0_z8_scalar( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8( %unused, @ld1_x2_i16_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -477,14 +569,20 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -497,15 +595,21 @@ define @ld1_x2_i16_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -573,31 +677,45 @@ define @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8( %unused, @ld1_x2_i32_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -736,14 +880,20 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -756,15 +906,21 @@ define @ld1_x2_i32_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -832,31 +988,45 @@ define @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8( %unused, @ld1_x2_i64_z0_z8_scalar( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -995,14 +1191,20 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1015,15 +1217,21 @@ define @ld1_x2_i64_z0_z8_scalar( %unused, < ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1093,32 +1301,46 @@ define @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12( %unused, @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1270,14 +1516,19 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1294,15 +1545,20 @@ define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unu ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1376,32 +1632,46 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1412,14 +1682,19 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1436,15 +1711,20 @@ define @ld1_x4_i16_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1518,32 +1798,46 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1554,14 +1848,19 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1578,15 +1877,20 @@ define @ld1_x4_i16_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1660,32 +1964,46 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1696,14 +2014,19 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1720,15 +2043,20 @@ define @ld1_x4_i32_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1802,32 +2130,46 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1838,14 +2180,19 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1862,15 +2209,20 @@ define @ld1_x4_i32_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1944,32 +2296,46 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1980,14 +2346,19 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2004,15 +2375,20 @@ define @ld1_x4_i64_z0_z4_z8_z12( %unused, < ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2086,32 +2462,46 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -2122,14 +2512,19 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -2146,15 +2541,20 @@ define @ld1_x4_i64_z0_z4_z8_z12_scalar( %un ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll index 3d3748e101122..eff1260c947dd 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll @@ -8,31 +8,45 @@ define @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8( %unused, @ldnt1_x2_i8_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -124,14 +164,20 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -144,15 +190,21 @@ define @ldnt1_x2_i8_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -173,31 +225,45 @@ define @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8( %unused, @ldnt1_x2_i16_z0_z8_scalar( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -289,14 +381,20 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -309,15 +407,21 @@ define @ldnt1_x2_i16_z0_z8_scalar( %unused ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -338,31 +442,45 @@ define @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8( %unused, @ldnt1_x2_i32_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -454,14 +598,20 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -474,15 +624,21 @@ define @ldnt1_x2_i32_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -503,31 +659,45 @@ define @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8( %unused, @ldnt1_x2_i64_z0_z8_scalar( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: mov z1.d, z8.d -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload ; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; STRIDED-NEXT: ret @@ -619,14 +815,20 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-16 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-2 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -639,15 +841,21 @@ define @ldnt1_x2_i64_z0_z8_scalar( %unused, ; CONTIGUOUS-NEXT: ldr z0, [sp] ; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #2 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #16 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -668,32 +876,46 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -704,14 +926,19 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -728,15 +955,20 @@ define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -760,32 +992,46 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -796,14 +1042,19 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -820,15 +1071,20 @@ define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %u ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -853,32 +1109,46 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -889,14 +1159,19 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -913,15 +1188,20 @@ define @ldnt1_x4_i16_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -945,32 +1225,46 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -981,14 +1275,19 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1005,15 +1304,20 @@ define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1038,32 +1342,46 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1074,14 +1392,19 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1098,15 +1421,20 @@ define @ldnt1_x4_i32_z0_z4_z8_z12( %unused ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1130,32 +1458,46 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1166,14 +1508,19 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1190,15 +1537,20 @@ define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1223,32 +1575,46 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1259,14 +1625,19 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1283,15 +1654,20 @@ define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1315,32 +1691,46 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; STRIDED-NEXT: addvl sp, sp, #-17 ; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #22, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #26, mul vl] // 32-byte Folded Spill -; STRIDED-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #30, mul vl] // 32-byte Folded Spill ; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill ; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] ; STRIDED-NEXT: //APP ; STRIDED-NEXT: nop ; STRIDED-NEXT: //NO_APP -; STRIDED-NEXT: ptrue pn8.b -; STRIDED-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #26, mul vl] // 32-byte Folded Reload +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z2.d, z8.d ; STRIDED-NEXT: mov z3.d, z12.d -; STRIDED-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #22, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #30, mul vl] // 32-byte Folded Reload -; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload ; STRIDED-NEXT: mov z1.d, z4.d ; STRIDED-NEXT: addvl sp, sp, #17 ; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1351,14 +1741,19 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-15 ; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CONTIGUOUS-NEXT: ptrue pn8.b -; CONTIGUOUS-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #6, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #10, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #14, mul vl] // 32-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #18, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill -; CONTIGUOUS-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill ; CONTIGUOUS-NEXT: addvl sp, sp, #-4 ; CONTIGUOUS-NEXT: mov p8.b, p0.b @@ -1375,15 +1770,20 @@ define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( % ; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] ; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] ; CONTIGUOUS-NEXT: addvl sp, sp, #4 -; CONTIGUOUS-NEXT: ptrue pn8.b +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload ; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #6, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #10, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #14, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #18, mul vl] // 32-byte Folded Reload -; CONTIGUOUS-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload ; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CONTIGUOUS-NEXT: addvl sp, sp, #15 ; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll index 470c0dd45782c..c0d5d9dfdbb0d 100644 --- a/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll +++ b/llvm/test/CodeGen/AArch64/sve-callee-save-restore-pairs.ll @@ -88,27 +88,34 @@ define void @fbyte( %v) { ; PAIR: // %bb.0: ; PAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-18 -; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; PAIR-NEXT: .cfi_offset w30, -8 ; PAIR-NEXT: .cfi_offset w29, -16 @@ -121,16 +128,23 @@ define void @fbyte( %v) { ; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; PAIR-NEXT: bl my_func -; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -230,27 +244,34 @@ define void @fhalf( %v) { ; PAIR: // %bb.0: ; PAIR-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-18 -; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z22.b, z23.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z20.b, z21.b }, pn8, [sp, #8, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z18.b, z19.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z16.b, z17.b }, pn8, [sp, #16, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z14.b, z15.b }, pn8, [sp, #20, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: st1b { z12.b, z13.b }, pn8, [sp, #24, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z10.b, z11.b }, pn8, [sp, #28, mul vl] // 32-byte Folded Spill ; PAIR-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #32, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG ; PAIR-NEXT: .cfi_offset w30, -8 ; PAIR-NEXT: .cfi_offset w29, -16 @@ -263,16 +284,23 @@ define void @fhalf( %v) { ; PAIR-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG ; PAIR-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG ; PAIR-NEXT: bl my_func -; PAIR-NEXT: ptrue pn8.b +; PAIR-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z22.b, z23.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z20.b, z21.b }, pn8/z, [sp, #8, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z18.b, z19.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z16.b, z17.b }, pn8/z, [sp, #16, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z14.b, z15.b }, pn8/z, [sp, #20, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z12.b, z13.b }, pn8/z, [sp, #24, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z10.b, z11.b }, pn8/z, [sp, #28, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #32, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload @@ -323,12 +351,11 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() { ; PAIR: // %bb.0: ; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-4 -; PAIR-NEXT: str p8, [sp, #5, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG @@ -336,10 +363,9 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs() { ; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ; PAIR-NEXT: //APP ; PAIR-NEXT: //NO_APP -; PAIR-NEXT: ptrue pn8.b ; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #4, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ldr p8, [sp, #5, mul vl] // 2-byte Folded Reload +; PAIR-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: addvl sp, sp, #4 @@ -381,11 +407,11 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() { ; PAIR: // %bb.0: ; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; PAIR-NEXT: addvl sp, sp, #-4 -; PAIR-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: ptrue pn9.b ; PAIR-NEXT: str p10, [sp, #6, mul vl] // 2-byte Folded Spill +; PAIR-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill ; PAIR-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill -; PAIR-NEXT: st1b { z8.b, z9.b }, pn9, [sp, #4, mul vl] // 32-byte Folded Spill +; PAIR-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill ; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG @@ -393,10 +419,10 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_p_regs2() { ; PAIR-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG ; PAIR-NEXT: //APP ; PAIR-NEXT: //NO_APP -; PAIR-NEXT: ptrue pn9.b ; PAIR-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload ; PAIR-NEXT: ldr p10, [sp, #6, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn9/z, [sp, #4, mul vl] // 32-byte Folded Reload ; PAIR-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload ; PAIR-NEXT: addvl sp, sp, #4 ; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -429,20 +455,18 @@ define aarch64_sve_vector_pcs void @test_clobbers_z_regs() { ; PAIR-LABEL: test_clobbers_z_regs: ; PAIR: // %bb.0: ; PAIR-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; PAIR-NEXT: addvl sp, sp, #-3 -; PAIR-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; PAIR-NEXT: ptrue pn8.b -; PAIR-NEXT: st1b { z8.b, z9.b }, pn8, [sp, #2, mul vl] // 32-byte Folded Spill -; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; PAIR-NEXT: addvl sp, sp, #-2 +; PAIR-NEXT: str z9, [sp] // 16-byte Folded Spill +; PAIR-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; PAIR-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; PAIR-NEXT: .cfi_offset w29, -16 ; PAIR-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG ; PAIR-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG ; PAIR-NEXT: //APP ; PAIR-NEXT: //NO_APP -; PAIR-NEXT: ptrue pn8.b -; PAIR-NEXT: ld1b { z8.b, z9.b }, pn8/z, [sp, #2, mul vl] // 32-byte Folded Reload -; PAIR-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload -; PAIR-NEXT: addvl sp, sp, #3 +; PAIR-NEXT: ldr z9, [sp] // 16-byte Folded Reload +; PAIR-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload +; PAIR-NEXT: addvl sp, sp, #2 ; PAIR-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; PAIR-NEXT: ret call void asm sideeffect "", "~{z8},~{z9}"()