diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp index 45f61262faf93..41b888592cc18 100644 --- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1868,12 +1868,16 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { // %reg.subidx. LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(mi->getOperand(0).getSubReg()); - SlotIndex Idx = LIS->getInstructionIndex(*mi); + SlotIndex Idx = LIS->getInstructionIndex(*mi).getRegSlot(); for (auto &S : LI.subranges()) { if ((S.LaneMask & LaneMask).none()) { - LiveRange::iterator UseSeg = S.FindSegmentContaining(Idx); - LiveRange::iterator DefSeg = std::next(UseSeg); - S.MergeValueNumberInto(DefSeg->valno, UseSeg->valno); + LiveRange::iterator DefSeg = S.FindSegmentContaining(Idx); + if (mi->getOperand(0).isUndef()) { + S.removeValNo(DefSeg->valno); + } else { + LiveRange::iterator UseSeg = std::prev(DefSeg); + S.MergeValueNumberInto(DefSeg->valno, UseSeg->valno); + } } } diff --git a/llvm/test/CodeGen/Thumb2/mve-fmath.ll b/llvm/test/CodeGen/Thumb2/mve-fmath.ll index 844da3baa42ba..3a477f987cee6 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmath.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmath.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS define arm_aapcs_vfpcc <4 x float> @sqrt_float32_t(<4 x float> %src) { ; CHECK-LABEL: sqrt_float32_t: @@ -1085,21 +1087,37 @@ entry: } define arm_aapcs_vfpcc <2 x double> @copysign_float64_t(<2 x double> %src1, <2 x double> %src2) { -; CHECK-LABEL: copysign_float64_t: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vmov r0, r1, d3 -; CHECK-NEXT: vmov r0, lr, d2 -; CHECK-NEXT: vmov r0, r3, d1 -; CHECK-NEXT: vmov r12, r2, d0 -; CHECK-NEXT: lsrs r1, r1, #31 -; CHECK-NEXT: bfi r3, r1, #31, #1 -; CHECK-NEXT: lsr.w r1, lr, #31 -; CHECK-NEXT: bfi r2, r1, #31, #1 -; CHECK-NEXT: vmov d1, r0, r3 -; CHECK-NEXT: vmov d0, r12, r2 -; CHECK-NEXT: pop {r7, pc} +; CHECK-LV-LABEL: copysign_float64_t: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .save {r7, lr} +; CHECK-LV-NEXT: push {r7, lr} +; CHECK-LV-NEXT: vmov r0, r1, d3 +; CHECK-LV-NEXT: vmov r0, lr, d2 +; CHECK-LV-NEXT: vmov r0, r3, d1 +; CHECK-LV-NEXT: vmov r12, r2, d0 +; CHECK-LV-NEXT: lsrs r1, r1, #31 +; CHECK-LV-NEXT: bfi r3, r1, #31, #1 +; CHECK-LV-NEXT: lsr.w r1, lr, #31 +; CHECK-LV-NEXT: bfi r2, r1, #31, #1 +; CHECK-LV-NEXT: vmov d1, r0, r3 +; CHECK-LV-NEXT: vmov d0, r12, r2 +; CHECK-LV-NEXT: pop {r7, pc} +; +; CHECK-LIS-LABEL: copysign_float64_t: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .save {r4, lr} +; CHECK-LIS-NEXT: push {r4, lr} +; CHECK-LIS-NEXT: vmov r0, r12, d3 +; CHECK-LIS-NEXT: vmov r0, lr, d2 +; CHECK-LIS-NEXT: vmov r4, r3, d1 +; CHECK-LIS-NEXT: vmov r1, r2, d0 +; CHECK-LIS-NEXT: lsr.w r0, r12, #31 +; CHECK-LIS-NEXT: bfi r3, r0, #31, #1 +; CHECK-LIS-NEXT: lsr.w r0, lr, #31 +; CHECK-LIS-NEXT: bfi r2, r0, #31, #1 +; CHECK-LIS-NEXT: vmov d1, r4, r3 +; CHECK-LIS-NEXT: vmov d0, r1, r2 +; CHECK-LIS-NEXT: pop {r4, pc} entry: %0 = call fast <2 x double> @llvm.copysign.v2f64(<2 x double> %src1, <2 x double> %src2) ret <2 x double> %0 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index 93a058828765e..6e644c58687fa 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECKFP +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LV,CHECKFP +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LIS,CHECKFP define arm_aapcs_vfpcc <4 x i32> @shuffle1_i32(<4 x i32> %src) { ; CHECK-LABEL: shuffle1_i32: @@ -221,18 +223,31 @@ entry: } define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) { -; CHECK-LABEL: shuffle3_i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vins.f16 s5, s4 -; CHECK-NEXT: vins.f16 s2, s0 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmovx.f16 s1, s7 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vins.f16 s1, s7 -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle3_i16: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: vmov q1, q0 +; CHECK-LV-NEXT: vmovx.f16 s2, s5 +; CHECK-LV-NEXT: vmovx.f16 s0, s4 +; CHECK-LV-NEXT: vins.f16 s5, s4 +; CHECK-LV-NEXT: vins.f16 s2, s0 +; CHECK-LV-NEXT: vmov.f32 s3, s5 +; CHECK-LV-NEXT: vmovx.f16 s1, s7 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vins.f16 s1, s7 +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: shuffle3_i16: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: vmovx.f16 s5, s3 +; CHECK-LIS-NEXT: vmovx.f16 s6, s1 +; CHECK-LIS-NEXT: vmovx.f16 s4, s0 +; CHECK-LIS-NEXT: vins.f16 s1, s0 +; CHECK-LIS-NEXT: vins.f16 s6, s4 +; CHECK-LIS-NEXT: vins.f16 s5, s3 +; CHECK-LIS-NEXT: vmov.f32 s7, s1 +; CHECK-LIS-NEXT: vmov.f32 s4, s2 +; CHECK-LIS-NEXT: vmov q0, q1 +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> ret <8 x i16> %out @@ -476,42 +491,79 @@ entry: } define arm_aapcs_vfpcc <16 x i8> @shuffle3_i8(<16 x i8> %src) { -; CHECK-LABEL: shuffle3_i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov.u8 r0, q1[5] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov.u8 r0, q1[15] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov.u8 r0, q1[7] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov.u8 r0, q1[14] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov.u8 r0, q1[9] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov.u8 r0, q1[6] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov.u8 r0, q1[3] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov.u8 r0, q1[10] -; CHECK-NEXT: vmov.8 q0[8], r0 -; CHECK-NEXT: vmov.u8 r0, q1[12] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.8 q0[10], r0 -; CHECK-NEXT: vmov.u8 r0, q1[13] -; CHECK-NEXT: vmov.8 q0[11], r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] -; CHECK-NEXT: vmov.8 q0[12], r0 -; CHECK-NEXT: vmov.u8 r0, q1[8] -; CHECK-NEXT: vmov.8 q0[13], r0 -; CHECK-NEXT: vmov.u8 r0, q1[0] -; CHECK-NEXT: vmov.8 q0[14], r0 -; CHECK-NEXT: vmov.u8 r0, q1[11] -; CHECK-NEXT: vmov.8 q0[15], r0 -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle3_i8: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: vmov q1, q0 +; CHECK-LV-NEXT: vmov.u8 r0, q0[4] +; CHECK-LV-NEXT: vmov.8 q0[0], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[5] +; CHECK-LV-NEXT: vmov.8 q0[1], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[15] +; CHECK-LV-NEXT: vmov.8 q0[2], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[7] +; CHECK-LV-NEXT: vmov.8 q0[3], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[14] +; CHECK-LV-NEXT: vmov.8 q0[4], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[9] +; CHECK-LV-NEXT: vmov.8 q0[5], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[6] +; CHECK-LV-NEXT: vmov.8 q0[6], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[3] +; CHECK-LV-NEXT: vmov.8 q0[7], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[10] +; CHECK-LV-NEXT: vmov.8 q0[8], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[12] +; CHECK-LV-NEXT: vmov.8 q0[9], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[1] +; CHECK-LV-NEXT: vmov.8 q0[10], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[13] +; CHECK-LV-NEXT: vmov.8 q0[11], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[2] +; CHECK-LV-NEXT: vmov.8 q0[12], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[8] +; CHECK-LV-NEXT: vmov.8 q0[13], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[0] +; CHECK-LV-NEXT: vmov.8 q0[14], r0 +; CHECK-LV-NEXT: vmov.u8 r0, q1[11] +; CHECK-LV-NEXT: vmov.8 q0[15], r0 +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: shuffle3_i8: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: vmov.u8 r0, q0[4] +; CHECK-LIS-NEXT: vmov.8 q1[0], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[5] +; CHECK-LIS-NEXT: vmov.8 q1[1], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[15] +; CHECK-LIS-NEXT: vmov.8 q1[2], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[7] +; CHECK-LIS-NEXT: vmov.8 q1[3], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[14] +; CHECK-LIS-NEXT: vmov.8 q1[4], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[9] +; CHECK-LIS-NEXT: vmov.8 q1[5], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[6] +; CHECK-LIS-NEXT: vmov.8 q1[6], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[3] +; CHECK-LIS-NEXT: vmov.8 q1[7], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[10] +; CHECK-LIS-NEXT: vmov.8 q1[8], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[12] +; CHECK-LIS-NEXT: vmov.8 q1[9], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[1] +; CHECK-LIS-NEXT: vmov.8 q1[10], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[13] +; CHECK-LIS-NEXT: vmov.8 q1[11], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[2] +; CHECK-LIS-NEXT: vmov.8 q1[12], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[8] +; CHECK-LIS-NEXT: vmov.8 q1[13], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[0] +; CHECK-LIS-NEXT: vmov.8 q1[14], r0 +; CHECK-LIS-NEXT: vmov.u8 r0, q0[11] +; CHECK-LIS-NEXT: vmov.8 q1[15], r0 +; CHECK-LIS-NEXT: vmov q0, q1 +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <16 x i8> %src, <16 x i8> undef, <16 x i32> ret <16 x i8> %out @@ -1143,18 +1195,31 @@ entry: } define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) { -; CHECK-LABEL: shuffle3_f16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmovx.f16 s2, s5 -; CHECK-NEXT: vmovx.f16 s0, s4 -; CHECK-NEXT: vins.f16 s5, s4 -; CHECK-NEXT: vins.f16 s2, s0 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmovx.f16 s1, s7 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vins.f16 s1, s7 -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: shuffle3_f16: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: vmov q1, q0 +; CHECK-LV-NEXT: vmovx.f16 s2, s5 +; CHECK-LV-NEXT: vmovx.f16 s0, s4 +; CHECK-LV-NEXT: vins.f16 s5, s4 +; CHECK-LV-NEXT: vins.f16 s2, s0 +; CHECK-LV-NEXT: vmov.f32 s3, s5 +; CHECK-LV-NEXT: vmovx.f16 s1, s7 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vins.f16 s1, s7 +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: shuffle3_f16: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: vmovx.f16 s5, s3 +; CHECK-LIS-NEXT: vmovx.f16 s6, s1 +; CHECK-LIS-NEXT: vmovx.f16 s4, s0 +; CHECK-LIS-NEXT: vins.f16 s1, s0 +; CHECK-LIS-NEXT: vins.f16 s6, s4 +; CHECK-LIS-NEXT: vins.f16 s5, s3 +; CHECK-LIS-NEXT: vmov.f32 s7, s1 +; CHECK-LIS-NEXT: vmov.f32 s4, s2 +; CHECK-LIS-NEXT: vmov q0, q1 +; CHECK-LIS-NEXT: bx lr entry: %out = shufflevector <8 x half> %src, <8 x half> undef, <8 x i32> ret <8 x half> %out diff --git a/llvm/test/CodeGen/Thumb2/mve-vld3.ll b/llvm/test/CodeGen/Thumb2/mve-vld3.ll index 4895eabb71ec0..8a94e571e9836 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vld3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vld3.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,CHECK-LV +; RUN: llc -early-live-intervals -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK,CHECK-LIS ; i32 @@ -67,46 +68,87 @@ entry: } define void @vld3_v8i32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v8i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vadd.i32 q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vadd.i32 q1, q4, q1 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v8i32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0] +; CHECK-LV-NEXT: vmov.f32 s17, s4 +; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LV-NEXT: vmov.f32 s18, s7 +; CHECK-LV-NEXT: vmov.f32 s22, s6 +; CHECK-LV-NEXT: vmov.f32 s16, s9 +; CHECK-LV-NEXT: vmov.f32 s19, s14 +; CHECK-LV-NEXT: vmov.f32 s20, s8 +; CHECK-LV-NEXT: vmov.f32 s21, s11 +; CHECK-LV-NEXT: vmov.f32 s23, s13 +; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 +; CHECK-LV-NEXT: vmov.f32 s4, s10 +; CHECK-LV-NEXT: vmov.f32 s6, s12 +; CHECK-LV-NEXT: vmov.f32 s7, s15 +; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 +; CHECK-LV-NEXT: vstrw.32 q1, [r1] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: vld3_v8i32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s13, s0 +; CHECK-LIS-NEXT: vmov.f32 s14, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s12, s5 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s11, s17 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vadd.i32 q2, q2, q3 +; CHECK-LIS-NEXT: vmov.f32 s2, s16 +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s3, s19 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] +; CHECK-LIS-NEXT: vmov.f32 s13, s4 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LIS-NEXT: vmov.f32 s14, s7 +; CHECK-LIS-NEXT: vmov.f32 s22, s6 +; CHECK-LIS-NEXT: vmov.f32 s12, s9 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s20, s8 +; CHECK-LIS-NEXT: vmov.f32 s21, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s17 +; CHECK-LIS-NEXT: vadd.i32 q3, q5, q3 +; CHECK-LIS-NEXT: vmov.f32 s4, s10 +; CHECK-LIS-NEXT: vmov.f32 s6, s16 +; CHECK-LIS-NEXT: vmov.f32 s7, s19 +; CHECK-LIS-NEXT: vadd.i32 q1, q3, q1 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <24 x i32>, ptr %src, align 4 %s1 = shufflevector <24 x i32> %l1, <24 x i32> undef, <8 x i32> @@ -119,80 +161,155 @@ entry: } define void @vld3_v16i32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v16i32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vadd.i32 q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vadd.i32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.i32 q1, q4, q1 -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s16, s12 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s23, s26 -; CHECK-NEXT: vmov.f32 s19, s25 -; CHECK-NEXT: vadd.i32 q4, q4, q5 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s10, s24 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s11, s27 -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vadd.i32 q2, q4, q2 -; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s30, s14 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.f32 s27, s22 -; CHECK-NEXT: vmov.f32 s28, s16 -; CHECK-NEXT: vmov.f32 s29, s19 -; CHECK-NEXT: vmov.f32 s31, s21 -; CHECK-NEXT: vadd.i32 q6, q7, q6 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vadd.i32 q3, q6, q3 -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v16i32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LV-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.i32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0] +; CHECK-LV-NEXT: vmov.f32 s17, s4 +; CHECK-LV-NEXT: vmov.f32 s18, s7 +; CHECK-LV-NEXT: vmov.f32 s22, s6 +; CHECK-LV-NEXT: vmov.f32 s16, s9 +; CHECK-LV-NEXT: vmov.f32 s19, s14 +; CHECK-LV-NEXT: vmov.f32 s20, s8 +; CHECK-LV-NEXT: vmov.f32 s21, s11 +; CHECK-LV-NEXT: vmov.f32 s23, s13 +; CHECK-LV-NEXT: vmov.f32 s4, s10 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-LV-NEXT: vmov.f32 s6, s12 +; CHECK-LV-NEXT: vadd.i32 q4, q5, q4 +; CHECK-LV-NEXT: vmov.f32 s7, s15 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-LV-NEXT: vadd.i32 q1, q4, q1 +; CHECK-LV-NEXT: vmov.f32 s18, s10 +; CHECK-LV-NEXT: vmov.f32 s21, s8 +; CHECK-LV-NEXT: vmov.f32 s22, s11 +; CHECK-LV-NEXT: vmov.f32 s16, s12 +; CHECK-LV-NEXT: vmov.f32 s17, s15 +; CHECK-LV-NEXT: vmov.f32 s20, s13 +; CHECK-LV-NEXT: vmov.f32 s23, s26 +; CHECK-LV-NEXT: vmov.f32 s19, s25 +; CHECK-LV-NEXT: vadd.i32 q4, q4, q5 +; CHECK-LV-NEXT: vmov.f32 s8, s14 +; CHECK-LV-NEXT: vmov.f32 s10, s24 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-LV-NEXT: vmov.f32 s11, s27 +; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-LV-NEXT: vadd.i32 q2, q4, q2 +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-LV-NEXT: vmov.f32 s25, s12 +; CHECK-LV-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-LV-NEXT: vmov.f32 s26, s15 +; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LV-NEXT: vmov.f32 s30, s14 +; CHECK-LV-NEXT: vstrw.32 q1, [r1] +; CHECK-LV-NEXT: vmov.f32 s24, s17 +; CHECK-LV-NEXT: vmov.f32 s27, s22 +; CHECK-LV-NEXT: vmov.f32 s28, s16 +; CHECK-LV-NEXT: vmov.f32 s29, s19 +; CHECK-LV-NEXT: vmov.f32 s31, s21 +; CHECK-LV-NEXT: vadd.i32 q6, q7, q6 +; CHECK-LV-NEXT: vmov.f32 s12, s18 +; CHECK-LV-NEXT: vmov.f32 s14, s20 +; CHECK-LV-NEXT: vmov.f32 s15, s23 +; CHECK-LV-NEXT: vadd.i32 q3, q6, q3 +; CHECK-LV-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: vld3_v16i32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s13, s0 +; CHECK-LIS-NEXT: vmov.f32 s14, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s12, s5 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s11, s17 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vadd.i32 q2, q2, q3 +; CHECK-LIS-NEXT: vmov.f32 s2, s16 +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s3, s19 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-LIS-NEXT: vadd.i32 q0, q2, q0 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] +; CHECK-LIS-NEXT: vmov.f32 s13, s4 +; CHECK-LIS-NEXT: vmov.f32 s14, s7 +; CHECK-LIS-NEXT: vmov.f32 s22, s6 +; CHECK-LIS-NEXT: vmov.f32 s12, s9 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s20, s8 +; CHECK-LIS-NEXT: vmov.f32 s21, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s17 +; CHECK-LIS-NEXT: vadd.i32 q3, q5, q3 +; CHECK-LIS-NEXT: vmov.f32 s4, s10 +; CHECK-LIS-NEXT: vmov.f32 s7, s19 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-LIS-NEXT: vmov.f32 s6, s16 +; CHECK-LIS-NEXT: vadd.i32 q1, q3, q1 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-LIS-NEXT: vmov.f32 s18, s10 +; CHECK-LIS-NEXT: vmov.f32 s21, s8 +; CHECK-LIS-NEXT: vmov.f32 s22, s11 +; CHECK-LIS-NEXT: vmov.f32 s16, s12 +; CHECK-LIS-NEXT: vmov.f32 s17, s15 +; CHECK-LIS-NEXT: vmov.f32 s20, s13 +; CHECK-LIS-NEXT: vmov.f32 s23, s26 +; CHECK-LIS-NEXT: vmov.f32 s19, s25 +; CHECK-LIS-NEXT: vmov.f32 s8, s14 +; CHECK-LIS-NEXT: vadd.i32 q4, q4, q5 +; CHECK-LIS-NEXT: vmov.f32 s10, s24 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-LIS-NEXT: vmov.f32 s11, s27 +; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #128] +; CHECK-LIS-NEXT: vadd.i32 q2, q4, q2 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-LIS-NEXT: vmov.f32 s21, s12 +; CHECK-LIS-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-LIS-NEXT: vmov.f32 s22, s15 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LIS-NEXT: vmov.f32 s30, s14 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1] +; CHECK-LIS-NEXT: vmov.f32 s20, s17 +; CHECK-LIS-NEXT: vmov.f32 s23, s26 +; CHECK-LIS-NEXT: vmov.f32 s28, s16 +; CHECK-LIS-NEXT: vmov.f32 s29, s19 +; CHECK-LIS-NEXT: vmov.f32 s31, s25 +; CHECK-LIS-NEXT: vadd.i32 q5, q7, q5 +; CHECK-LIS-NEXT: vmov.f32 s12, s18 +; CHECK-LIS-NEXT: vmov.f32 s14, s24 +; CHECK-LIS-NEXT: vmov.f32 s15, s27 +; CHECK-LIS-NEXT: vadd.i32 q3, q5, q3 +; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <48 x i32>, ptr %src, align 4 %s1 = shufflevector <48 x i32> %l1, <48 x i32> undef, <16 x i32> @@ -247,32 +364,59 @@ entry: } define void @vld3_v4i16(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v4i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.u16 r5, q0[6] -; CHECK-NEXT: vmov.u16 r6, q0[0] -; CHECK-NEXT: vmov r0, r3, d2 -; CHECK-NEXT: vmov.u16 lr, q0[2] -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r5 -; CHECK-NEXT: vmov.u16 r5, q0[7] -; CHECK-NEXT: vmov.u16 r6, q0[1] -; CHECK-NEXT: vmov q2[2], q2[0], r6, r5 -; CHECK-NEXT: vmov.u16 r5, q0[3] -; CHECK-NEXT: vmov.u16 r6, q0[4] -; CHECK-NEXT: vmov q1[3], q1[1], r5, r3 -; CHECK-NEXT: vmov q2[3], q2[1], r6, r2 -; CHECK-NEXT: vmov.u16 r12, q0[5] -; CHECK-NEXT: vadd.i32 q0, q1, q2 -; CHECK-NEXT: vmov q1[2], q1[0], lr, r0 -; CHECK-NEXT: vmov q1[3], q1[1], r12, r4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vstrh.32 q0, [r1] -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-LV-LABEL: vld3_v4i16: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .save {r4, r5, r6, lr} +; CHECK-LV-NEXT: push {r4, r5, r6, lr} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0] +; CHECK-LV-NEXT: vldrh.u32 q1, [r0, #16] +; CHECK-LV-NEXT: vmov.u16 r5, q0[6] +; CHECK-LV-NEXT: vmov.u16 r6, q0[0] +; CHECK-LV-NEXT: vmov r0, r3, d2 +; CHECK-LV-NEXT: vmov.u16 lr, q0[2] +; CHECK-LV-NEXT: vmov r2, r4, d3 +; CHECK-LV-NEXT: vmov q1[2], q1[0], r6, r5 +; CHECK-LV-NEXT: vmov.u16 r5, q0[7] +; CHECK-LV-NEXT: vmov.u16 r6, q0[1] +; CHECK-LV-NEXT: vmov q2[2], q2[0], r6, r5 +; CHECK-LV-NEXT: vmov.u16 r5, q0[3] +; CHECK-LV-NEXT: vmov.u16 r6, q0[4] +; CHECK-LV-NEXT: vmov q1[3], q1[1], r5, r3 +; CHECK-LV-NEXT: vmov q2[3], q2[1], r6, r2 +; CHECK-LV-NEXT: vmov.u16 r12, q0[5] +; CHECK-LV-NEXT: vadd.i32 q0, q1, q2 +; CHECK-LV-NEXT: vmov q1[2], q1[0], lr, r0 +; CHECK-LV-NEXT: vmov q1[3], q1[1], r12, r4 +; CHECK-LV-NEXT: vadd.i32 q0, q0, q1 +; CHECK-LV-NEXT: vstrh.32 q0, [r1] +; CHECK-LV-NEXT: pop {r4, r5, r6, pc} +; +; CHECK-LIS-LABEL: vld3_v4i16: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .save {r4, r5, r6, lr} +; CHECK-LIS-NEXT: push {r4, r5, r6, lr} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] +; CHECK-LIS-NEXT: vldrh.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.u16 r5, q0[6] +; CHECK-LIS-NEXT: vmov.u16 r6, q0[0] +; CHECK-LIS-NEXT: vmov r0, r2, d2 +; CHECK-LIS-NEXT: vmov.u16 r12, q0[2] +; CHECK-LIS-NEXT: vmov r3, r4, d3 +; CHECK-LIS-NEXT: vmov q1[2], q1[0], r6, r5 +; CHECK-LIS-NEXT: vmov.u16 r5, q0[7] +; CHECK-LIS-NEXT: vmov.u16 r6, q0[1] +; CHECK-LIS-NEXT: vmov q2[2], q2[0], r6, r5 +; CHECK-LIS-NEXT: vmov.u16 r5, q0[3] +; CHECK-LIS-NEXT: vmov.u16 r6, q0[4] +; CHECK-LIS-NEXT: vmov q1[3], q1[1], r5, r2 +; CHECK-LIS-NEXT: vmov q2[3], q2[1], r6, r3 +; CHECK-LIS-NEXT: vmov.u16 lr, q0[5] +; CHECK-LIS-NEXT: vadd.i32 q0, q1, q2 +; CHECK-LIS-NEXT: vmov q1[2], q1[0], r12, r0 +; CHECK-LIS-NEXT: vmov q1[3], q1[1], lr, r4 +; CHECK-LIS-NEXT: vadd.i32 q0, q0, q1 +; CHECK-LIS-NEXT: vstrh.32 q0, [r1] +; CHECK-LIS-NEXT: pop {r4, r5, r6, pc} entry: %l1 = load <12 x i16>, ptr %src, align 4 %s1 = shufflevector <12 x i16> %l1, <12 x i16> undef, <4 x i32> @@ -340,86 +484,167 @@ entry: } define void @vld3_v16i16(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v16i16: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #48] -; CHECK-NEXT: vldrw.u32 q2, [r0, #64] -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmovx.f16 s6, s2 -; CHECK-NEXT: vmov.f32 s4, s1 -; CHECK-NEXT: vins.f16 s4, s6 -; CHECK-NEXT: vmovx.f16 s6, s9 -; CHECK-NEXT: vmov.f32 s5, s8 -; CHECK-NEXT: vmovx.f16 s7, s12 -; CHECK-NEXT: vins.f16 s5, s6 -; CHECK-NEXT: vmov.f32 s6, s11 -; CHECK-NEXT: vins.f16 s6, s7 -; CHECK-NEXT: vmovx.f16 s16, s15 -; CHECK-NEXT: vmov.f32 s7, s14 -; CHECK-NEXT: vmovx.f16 s17, s3 -; CHECK-NEXT: vins.f16 s7, s16 -; CHECK-NEXT: vmovx.f16 s16, s0 -; CHECK-NEXT: vins.f16 s16, s2 -; CHECK-NEXT: vmovx.f16 s2, s1 -; CHECK-NEXT: vins.f16 s0, s2 -; CHECK-NEXT: vmovx.f16 s2, s8 -; CHECK-NEXT: vins.f16 s3, s2 -; CHECK-NEXT: vmovx.f16 s2, s11 -; CHECK-NEXT: vmovx.f16 s8, s14 -; CHECK-NEXT: vmovx.f16 s18, s10 -; CHECK-NEXT: vmovx.f16 s19, s13 -; CHECK-NEXT: vins.f16 s10, s2 -; CHECK-NEXT: vins.f16 s13, s8 -; CHECK-NEXT: vmov.f32 s1, s3 -; CHECK-NEXT: vins.f16 s18, s12 -; CHECK-NEXT: vins.f16 s19, s15 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vldrw.u32 q3, [r0, #16] -; CHECK-NEXT: vins.f16 s17, s9 -; CHECK-NEXT: vmov.f32 s2, s10 -; CHECK-NEXT: vadd.i16 q0, q0, q4 -; CHECK-NEXT: vldrw.u32 q2, [r0, #32] -; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vmovx.f16 s6, s14 -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmov.f32 s22, s15 -; CHECK-NEXT: vmovx.f16 s8, s8 -; CHECK-NEXT: vins.f16 s22, s8 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vmov.f32 s23, s10 -; CHECK-NEXT: vmovx.f16 s4, s16 -; CHECK-NEXT: vins.f16 s23, s8 -; CHECK-NEXT: vmovx.f16 s8, s17 -; CHECK-NEXT: vins.f16 s16, s8 -; CHECK-NEXT: vmovx.f16 s8, s12 -; CHECK-NEXT: vmovx.f16 s5, s19 -; CHECK-NEXT: vins.f16 s19, s8 -; CHECK-NEXT: vmovx.f16 s8, s15 -; CHECK-NEXT: vmovx.f16 s7, s9 -; CHECK-NEXT: vins.f16 s14, s8 -; CHECK-NEXT: vmovx.f16 s8, s10 -; CHECK-NEXT: vins.f16 s4, s18 -; CHECK-NEXT: vmov.f32 s20, s17 -; CHECK-NEXT: vmovx.f16 s18, s18 -; CHECK-NEXT: vins.f16 s9, s8 -; CHECK-NEXT: vins.f16 s5, s13 -; CHECK-NEXT: vins.f16 s20, s18 -; CHECK-NEXT: vmov.f32 s17, s19 -; CHECK-NEXT: vins.f16 s7, s11 -; CHECK-NEXT: vmovx.f16 s13, s13 -; CHECK-NEXT: vmov.f32 s21, s12 -; CHECK-NEXT: vmov.f32 s18, s14 -; CHECK-NEXT: vins.f16 s21, s13 -; CHECK-NEXT: vmov.f32 s19, s9 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vadd.i16 q1, q4, q1 -; CHECK-NEXT: vadd.i16 q1, q1, q5 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v16i16: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LV-NEXT: vmovx.f16 s6, s2 +; CHECK-LV-NEXT: vmov.f32 s4, s1 +; CHECK-LV-NEXT: vins.f16 s4, s6 +; CHECK-LV-NEXT: vmovx.f16 s6, s9 +; CHECK-LV-NEXT: vmov.f32 s5, s8 +; CHECK-LV-NEXT: vmovx.f16 s7, s12 +; CHECK-LV-NEXT: vins.f16 s5, s6 +; CHECK-LV-NEXT: vmov.f32 s6, s11 +; CHECK-LV-NEXT: vins.f16 s6, s7 +; CHECK-LV-NEXT: vmovx.f16 s16, s15 +; CHECK-LV-NEXT: vmov.f32 s7, s14 +; CHECK-LV-NEXT: vmovx.f16 s17, s3 +; CHECK-LV-NEXT: vins.f16 s7, s16 +; CHECK-LV-NEXT: vmovx.f16 s16, s0 +; CHECK-LV-NEXT: vins.f16 s16, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s1 +; CHECK-LV-NEXT: vins.f16 s0, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s8 +; CHECK-LV-NEXT: vins.f16 s3, s2 +; CHECK-LV-NEXT: vmovx.f16 s2, s11 +; CHECK-LV-NEXT: vmovx.f16 s8, s14 +; CHECK-LV-NEXT: vmovx.f16 s18, s10 +; CHECK-LV-NEXT: vmovx.f16 s19, s13 +; CHECK-LV-NEXT: vins.f16 s10, s2 +; CHECK-LV-NEXT: vins.f16 s13, s8 +; CHECK-LV-NEXT: vmov.f32 s1, s3 +; CHECK-LV-NEXT: vins.f16 s18, s12 +; CHECK-LV-NEXT: vins.f16 s19, s15 +; CHECK-LV-NEXT: vmov.f32 s3, s13 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-LV-NEXT: vins.f16 s17, s9 +; CHECK-LV-NEXT: vmov.f32 s2, s10 +; CHECK-LV-NEXT: vadd.i16 q0, q0, q4 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-LV-NEXT: vadd.i16 q0, q0, q1 +; CHECK-LV-NEXT: vmovx.f16 s6, s14 +; CHECK-LV-NEXT: vldrw.u32 q4, [r0] +; CHECK-LV-NEXT: vins.f16 s6, s8 +; CHECK-LV-NEXT: vmov.f32 s22, s15 +; CHECK-LV-NEXT: vmovx.f16 s8, s8 +; CHECK-LV-NEXT: vins.f16 s22, s8 +; CHECK-LV-NEXT: vmovx.f16 s8, s11 +; CHECK-LV-NEXT: vmov.f32 s23, s10 +; CHECK-LV-NEXT: vmovx.f16 s4, s16 +; CHECK-LV-NEXT: vins.f16 s23, s8 +; CHECK-LV-NEXT: vmovx.f16 s8, s17 +; CHECK-LV-NEXT: vins.f16 s16, s8 +; CHECK-LV-NEXT: vmovx.f16 s8, s12 +; CHECK-LV-NEXT: vmovx.f16 s5, s19 +; CHECK-LV-NEXT: vins.f16 s19, s8 +; CHECK-LV-NEXT: vmovx.f16 s8, s15 +; CHECK-LV-NEXT: vmovx.f16 s7, s9 +; CHECK-LV-NEXT: vins.f16 s14, s8 +; CHECK-LV-NEXT: vmovx.f16 s8, s10 +; CHECK-LV-NEXT: vins.f16 s4, s18 +; CHECK-LV-NEXT: vmov.f32 s20, s17 +; CHECK-LV-NEXT: vmovx.f16 s18, s18 +; CHECK-LV-NEXT: vins.f16 s9, s8 +; CHECK-LV-NEXT: vins.f16 s5, s13 +; CHECK-LV-NEXT: vins.f16 s20, s18 +; CHECK-LV-NEXT: vmov.f32 s17, s19 +; CHECK-LV-NEXT: vins.f16 s7, s11 +; CHECK-LV-NEXT: vmovx.f16 s13, s13 +; CHECK-LV-NEXT: vmov.f32 s21, s12 +; CHECK-LV-NEXT: vmov.f32 s18, s14 +; CHECK-LV-NEXT: vins.f16 s21, s13 +; CHECK-LV-NEXT: vmov.f32 s19, s9 +; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LV-NEXT: vadd.i16 q1, q4, q1 +; CHECK-LV-NEXT: vadd.i16 q1, q1, q5 +; CHECK-LV-NEXT: vstrw.32 q1, [r1] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: vld3_v16i16: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LIS-NEXT: vmovx.f16 s6, s2 +; CHECK-LIS-NEXT: vmov.f32 s4, s1 +; CHECK-LIS-NEXT: vins.f16 s4, s6 +; CHECK-LIS-NEXT: vmovx.f16 s6, s9 +; CHECK-LIS-NEXT: vmov.f32 s5, s8 +; CHECK-LIS-NEXT: vmovx.f16 s7, s12 +; CHECK-LIS-NEXT: vins.f16 s5, s6 +; CHECK-LIS-NEXT: vmov.f32 s6, s11 +; CHECK-LIS-NEXT: vins.f16 s6, s7 +; CHECK-LIS-NEXT: vmovx.f16 s16, s15 +; CHECK-LIS-NEXT: vmov.f32 s7, s14 +; CHECK-LIS-NEXT: vmovx.f16 s17, s3 +; CHECK-LIS-NEXT: vins.f16 s7, s16 +; CHECK-LIS-NEXT: vmovx.f16 s16, s0 +; CHECK-LIS-NEXT: vins.f16 s16, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s1 +; CHECK-LIS-NEXT: vins.f16 s0, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s8 +; CHECK-LIS-NEXT: vins.f16 s3, s2 +; CHECK-LIS-NEXT: vmovx.f16 s2, s11 +; CHECK-LIS-NEXT: vmovx.f16 s8, s14 +; CHECK-LIS-NEXT: vmovx.f16 s18, s10 +; CHECK-LIS-NEXT: vmovx.f16 s19, s13 +; CHECK-LIS-NEXT: vins.f16 s10, s2 +; CHECK-LIS-NEXT: vins.f16 s13, s8 +; CHECK-LIS-NEXT: vmov.f32 s1, s3 +; CHECK-LIS-NEXT: vins.f16 s18, s12 +; CHECK-LIS-NEXT: vins.f16 s19, s15 +; CHECK-LIS-NEXT: vmov.f32 s3, s13 +; CHECK-LIS-NEXT: vins.f16 s17, s9 +; CHECK-LIS-NEXT: vmov.f32 s2, s10 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-LIS-NEXT: vadd.i16 q0, q0, q4 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0] +; CHECK-LIS-NEXT: vadd.i16 q0, q0, q1 +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-LIS-NEXT: vmovx.f16 s10, s14 +; CHECK-LIS-NEXT: vmov.f32 s22, s15 +; CHECK-LIS-NEXT: vins.f16 s10, s4 +; CHECK-LIS-NEXT: vmovx.f16 s4, s4 +; CHECK-LIS-NEXT: vins.f16 s22, s4 +; CHECK-LIS-NEXT: vmovx.f16 s4, s7 +; CHECK-LIS-NEXT: vmov.f32 s23, s6 +; CHECK-LIS-NEXT: vmovx.f16 s8, s16 +; CHECK-LIS-NEXT: vins.f16 s23, s4 +; CHECK-LIS-NEXT: vmovx.f16 s4, s17 +; CHECK-LIS-NEXT: vins.f16 s16, s4 +; CHECK-LIS-NEXT: vmovx.f16 s4, s12 +; CHECK-LIS-NEXT: vmovx.f16 s9, s19 +; CHECK-LIS-NEXT: vins.f16 s19, s4 +; CHECK-LIS-NEXT: vmovx.f16 s4, s15 +; CHECK-LIS-NEXT: vmovx.f16 s11, s5 +; CHECK-LIS-NEXT: vins.f16 s14, s4 +; CHECK-LIS-NEXT: vmovx.f16 s4, s6 +; CHECK-LIS-NEXT: vins.f16 s8, s18 +; CHECK-LIS-NEXT: vmov.f32 s20, s17 +; CHECK-LIS-NEXT: vmovx.f16 s18, s18 +; CHECK-LIS-NEXT: vins.f16 s5, s4 +; CHECK-LIS-NEXT: vins.f16 s9, s13 +; CHECK-LIS-NEXT: vins.f16 s20, s18 +; CHECK-LIS-NEXT: vmov.f32 s17, s19 +; CHECK-LIS-NEXT: vins.f16 s11, s7 +; CHECK-LIS-NEXT: vmovx.f16 s13, s13 +; CHECK-LIS-NEXT: vmov.f32 s21, s12 +; CHECK-LIS-NEXT: vmov.f32 s18, s14 +; CHECK-LIS-NEXT: vins.f16 s21, s13 +; CHECK-LIS-NEXT: vmov.f32 s19, s5 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LIS-NEXT: vadd.i16 q1, q4, q2 +; CHECK-LIS-NEXT: vadd.i16 q1, q1, q5 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <48 x i16>, ptr %src, align 4 %s1 = shufflevector <48 x i16> %l1, <48 x i16> undef, <16 x i32> @@ -710,35 +935,65 @@ entry: ; i64 define void @vld3_v2i64(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v2i64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vmov.f32 s12, s2 -; CHECK-NEXT: vmov.f32 s13, s3 -; CHECK-NEXT: vmov.f32 s2, s4 -; CHECK-NEXT: vmov.f32 s3, s5 -; CHECK-NEXT: vmov r0, r3, d5 -; CHECK-NEXT: vmov r2, r4, d3 -; CHECK-NEXT: vmov r6, r7, d0 -; CHECK-NEXT: vmov r5, r8, d6 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: adds.w r0, r0, lr -; CHECK-NEXT: adc.w r3, r3, r12 -; CHECK-NEXT: adds r0, r0, r2 -; CHECK-NEXT: adc.w r2, r3, r4 -; CHECK-NEXT: vmov r3, r4, d4 -; CHECK-NEXT: adds r6, r6, r5 -; CHECK-NEXT: adc.w r7, r7, r8 -; CHECK-NEXT: adds r3, r3, r6 -; CHECK-NEXT: adcs r7, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, r0 -; CHECK-NEXT: vmov q0[3], q0[1], r7, r2 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-LV-LABEL: vld3_v2i64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #32] +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s12, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s3 +; CHECK-LV-NEXT: vmov.f32 s2, s4 +; CHECK-LV-NEXT: vmov.f32 s3, s5 +; CHECK-LV-NEXT: vmov r0, r3, d5 +; CHECK-LV-NEXT: vmov r2, r4, d3 +; CHECK-LV-NEXT: vmov r6, r7, d0 +; CHECK-LV-NEXT: vmov r5, r8, d6 +; CHECK-LV-NEXT: vmov lr, r12, d1 +; CHECK-LV-NEXT: adds.w r0, r0, lr +; CHECK-LV-NEXT: adc.w r3, r3, r12 +; CHECK-LV-NEXT: adds r0, r0, r2 +; CHECK-LV-NEXT: adc.w r2, r3, r4 +; CHECK-LV-NEXT: vmov r3, r4, d4 +; CHECK-LV-NEXT: adds r6, r6, r5 +; CHECK-LV-NEXT: adc.w r7, r7, r8 +; CHECK-LV-NEXT: adds r3, r3, r6 +; CHECK-LV-NEXT: adcs r7, r4 +; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-LV-NEXT: vmov q0[3], q0[1], r7, r2 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; +; CHECK-LIS-LABEL: vld3_v2i64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s12, s2 +; CHECK-LIS-NEXT: vmov.f32 s13, s3 +; CHECK-LIS-NEXT: vmov.f32 s2, s8 +; CHECK-LIS-NEXT: vmov.f32 s3, s9 +; CHECK-LIS-NEXT: vmov r0, r3, d3 +; CHECK-LIS-NEXT: vmov r2, r4, d5 +; CHECK-LIS-NEXT: vmov r6, r7, d0 +; CHECK-LIS-NEXT: vmov r5, r8, d6 +; CHECK-LIS-NEXT: vmov lr, r12, d1 +; CHECK-LIS-NEXT: adds.w r0, r0, lr +; CHECK-LIS-NEXT: adc.w r3, r3, r12 +; CHECK-LIS-NEXT: adds r0, r0, r2 +; CHECK-LIS-NEXT: adc.w r2, r3, r4 +; CHECK-LIS-NEXT: vmov r3, r4, d2 +; CHECK-LIS-NEXT: adds r6, r6, r5 +; CHECK-LIS-NEXT: adc.w r7, r7, r8 +; CHECK-LIS-NEXT: adds r3, r3, r6 +; CHECK-LIS-NEXT: adcs r7, r4 +; CHECK-LIS-NEXT: vmov q0[2], q0[0], r3, r0 +; CHECK-LIS-NEXT: vmov q0[3], q0[1], r7, r2 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <6 x i64>, ptr %src, align 4 %s1 = shufflevector <6 x i64> %l1, <6 x i64> undef, <2 x i32> @@ -751,62 +1006,120 @@ entry: } define void @vld3_v4i64(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v4i64: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vldrw.u32 q2, [r0, #16] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] -; CHECK-NEXT: vmov.f32 s4, s2 -; CHECK-NEXT: vldrw.u32 q4, [r0, #64] -; CHECK-NEXT: vmov.f32 s5, s3 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vmov r5, r4, d5 -; CHECK-NEXT: vmov r3, r8, d7 -; CHECK-NEXT: vldrw.u32 q3, [r0, #80] -; CHECK-NEXT: vmov.f32 s24, s22 -; CHECK-NEXT: vmov.f32 s25, s23 -; CHECK-NEXT: vmov lr, r12, d1 -; CHECK-NEXT: vmov.f32 s2, s12 -; CHECK-NEXT: vmov.f32 s3, s13 -; CHECK-NEXT: vmov r6, r7, d12 -; CHECK-NEXT: adds.w r0, r5, lr -; CHECK-NEXT: adc.w r5, r4, r12 -; CHECK-NEXT: adds.w lr, r0, r3 -; CHECK-NEXT: vmov r4, r2, d10 -; CHECK-NEXT: adc.w r12, r5, r8 -; CHECK-NEXT: vmov r5, r0, d8 -; CHECK-NEXT: adds r6, r6, r4 -; CHECK-NEXT: adcs r2, r7 -; CHECK-NEXT: adds r6, r6, r5 -; CHECK-NEXT: adc.w r8, r2, r0 -; CHECK-NEXT: vmov r7, r4, d1 -; CHECK-NEXT: vmov r2, r5, d9 -; CHECK-NEXT: vmov r3, r0, d0 -; CHECK-NEXT: adds r2, r2, r7 -; CHECK-NEXT: adc.w r7, r5, r4 -; CHECK-NEXT: vmov r5, r4, d7 -; CHECK-NEXT: adds r2, r2, r5 -; CHECK-NEXT: adcs r7, r4 -; CHECK-NEXT: vmov r5, r4, d2 -; CHECK-NEXT: vmov q1[2], q1[0], r6, r2 -; CHECK-NEXT: vmov q1[3], q1[1], r8, r7 -; CHECK-NEXT: vstrw.32 q1, [r1, #16] -; CHECK-NEXT: adds r3, r3, r5 -; CHECK-NEXT: adcs r0, r4 -; CHECK-NEXT: vmov r4, r5, d4 -; CHECK-NEXT: adds r3, r3, r4 -; CHECK-NEXT: vmov q0[2], q0[0], r3, lr -; CHECK-NEXT: adcs r0, r5 -; CHECK-NEXT: vmov q0[3], q0[1], r0, r12 -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-LV-LABEL: vld3_v4i64: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0] +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-LV-NEXT: vmov.f32 s4, s2 +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #64] +; CHECK-LV-NEXT: vmov.f32 s5, s3 +; CHECK-LV-NEXT: vmov.f32 s2, s12 +; CHECK-LV-NEXT: vmov.f32 s3, s13 +; CHECK-LV-NEXT: vmov r5, r4, d5 +; CHECK-LV-NEXT: vmov r3, r8, d7 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LV-NEXT: vmov.f32 s24, s22 +; CHECK-LV-NEXT: vmov.f32 s25, s23 +; CHECK-LV-NEXT: vmov lr, r12, d1 +; CHECK-LV-NEXT: vmov.f32 s2, s12 +; CHECK-LV-NEXT: vmov.f32 s3, s13 +; CHECK-LV-NEXT: vmov r6, r7, d12 +; CHECK-LV-NEXT: adds.w r0, r5, lr +; CHECK-LV-NEXT: adc.w r5, r4, r12 +; CHECK-LV-NEXT: adds.w lr, r0, r3 +; CHECK-LV-NEXT: vmov r4, r2, d10 +; CHECK-LV-NEXT: adc.w r12, r5, r8 +; CHECK-LV-NEXT: vmov r5, r0, d8 +; CHECK-LV-NEXT: adds r6, r6, r4 +; CHECK-LV-NEXT: adcs r2, r7 +; CHECK-LV-NEXT: adds r6, r6, r5 +; CHECK-LV-NEXT: adc.w r8, r2, r0 +; CHECK-LV-NEXT: vmov r7, r4, d1 +; CHECK-LV-NEXT: vmov r2, r5, d9 +; CHECK-LV-NEXT: vmov r3, r0, d0 +; CHECK-LV-NEXT: adds r2, r2, r7 +; CHECK-LV-NEXT: adc.w r7, r5, r4 +; CHECK-LV-NEXT: vmov r5, r4, d7 +; CHECK-LV-NEXT: adds r2, r2, r5 +; CHECK-LV-NEXT: adcs r7, r4 +; CHECK-LV-NEXT: vmov r5, r4, d2 +; CHECK-LV-NEXT: vmov q1[2], q1[0], r6, r2 +; CHECK-LV-NEXT: vmov q1[3], q1[1], r8, r7 +; CHECK-LV-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-LV-NEXT: adds r3, r3, r5 +; CHECK-LV-NEXT: adcs r0, r4 +; CHECK-LV-NEXT: vmov r4, r5, d4 +; CHECK-LV-NEXT: adds r3, r3, r4 +; CHECK-LV-NEXT: vmov q0[2], q0[0], r3, lr +; CHECK-LV-NEXT: adcs r0, r5 +; CHECK-LV-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-LV-NEXT: vstrw.32 q0, [r1] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-LV-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; +; CHECK-LIS-LABEL: vld3_v4i64: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0] +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-LIS-NEXT: vmov.f32 s8, s2 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #64] +; CHECK-LIS-NEXT: vmov.f32 s9, s3 +; CHECK-LIS-NEXT: vmov.f32 s2, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s13 +; CHECK-LIS-NEXT: vmov r2, r3, d3 +; CHECK-LIS-NEXT: vmov r4, r8, d7 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #80] +; CHECK-LIS-NEXT: vmov.f32 s24, s22 +; CHECK-LIS-NEXT: vmov.f32 s25, s23 +; CHECK-LIS-NEXT: vmov.f32 s7, s19 +; CHECK-LIS-NEXT: vmov lr, r12, d1 +; CHECK-LIS-NEXT: vmov.f32 s2, s12 +; CHECK-LIS-NEXT: vmov.f32 s3, s13 +; CHECK-LIS-NEXT: vmov r6, r7, d12 +; CHECK-LIS-NEXT: adds.w r0, r2, lr +; CHECK-LIS-NEXT: adc.w r2, r3, r12 +; CHECK-LIS-NEXT: adds.w lr, r0, r4 +; CHECK-LIS-NEXT: vmov r3, r5, d10 +; CHECK-LIS-NEXT: adc.w r12, r2, r8 +; CHECK-LIS-NEXT: vmov r2, r0, d8 +; CHECK-LIS-NEXT: adds r3, r3, r6 +; CHECK-LIS-NEXT: adcs r7, r5 +; CHECK-LIS-NEXT: adds r2, r2, r3 +; CHECK-LIS-NEXT: adc.w r8, r7, r0 +; CHECK-LIS-NEXT: vmov r6, r5, d1 +; CHECK-LIS-NEXT: vmov r3, r7, d3 +; CHECK-LIS-NEXT: vmov r4, r0, d0 +; CHECK-LIS-NEXT: adds r3, r3, r6 +; CHECK-LIS-NEXT: adcs r7, r5 +; CHECK-LIS-NEXT: vmov r6, r5, d7 +; CHECK-LIS-NEXT: adds r3, r3, r6 +; CHECK-LIS-NEXT: adcs r7, r5 +; CHECK-LIS-NEXT: vmov r6, r5, d4 +; CHECK-LIS-NEXT: adds r4, r4, r6 +; CHECK-LIS-NEXT: adcs r0, r5 +; CHECK-LIS-NEXT: vmov r5, r6, d2 +; CHECK-LIS-NEXT: vmov q1[2], q1[0], r2, r3 +; CHECK-LIS-NEXT: vmov q1[3], q1[1], r8, r7 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1, #16] +; CHECK-LIS-NEXT: adds r4, r4, r5 +; CHECK-LIS-NEXT: vmov q0[2], q0[0], r4, lr +; CHECK-LIS-NEXT: adcs r0, r6 +; CHECK-LIS-NEXT: vmov q0[3], q0[1], r0, r12 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12} +; CHECK-LIS-NEXT: pop.w {r4, r5, r6, r7, r8, pc} entry: %l1 = load <12 x i64>, ptr %src, align 4 %s1 = shufflevector <12 x i64> %l1, <12 x i64> undef, <4 x i32> @@ -881,46 +1194,87 @@ entry: } define void @vld3_v8f32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v8f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.f32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vadd.f32 q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vadd.f32 q1, q4, q1 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v8f32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0] +; CHECK-LV-NEXT: vmov.f32 s17, s4 +; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LV-NEXT: vmov.f32 s18, s7 +; CHECK-LV-NEXT: vmov.f32 s22, s6 +; CHECK-LV-NEXT: vmov.f32 s16, s9 +; CHECK-LV-NEXT: vmov.f32 s19, s14 +; CHECK-LV-NEXT: vmov.f32 s20, s8 +; CHECK-LV-NEXT: vmov.f32 s21, s11 +; CHECK-LV-NEXT: vmov.f32 s23, s13 +; CHECK-LV-NEXT: vadd.f32 q4, q5, q4 +; CHECK-LV-NEXT: vmov.f32 s4, s10 +; CHECK-LV-NEXT: vmov.f32 s6, s12 +; CHECK-LV-NEXT: vmov.f32 s7, s15 +; CHECK-LV-NEXT: vadd.f32 q1, q4, q1 +; CHECK-LV-NEXT: vstrw.32 q1, [r1] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: vld3_v8f32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s13, s0 +; CHECK-LIS-NEXT: vmov.f32 s14, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s12, s5 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s11, s17 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LIS-NEXT: vmov.f32 s2, s16 +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s3, s19 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-LIS-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] +; CHECK-LIS-NEXT: vmov.f32 s13, s4 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LIS-NEXT: vmov.f32 s14, s7 +; CHECK-LIS-NEXT: vmov.f32 s22, s6 +; CHECK-LIS-NEXT: vmov.f32 s12, s9 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s20, s8 +; CHECK-LIS-NEXT: vmov.f32 s21, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s17 +; CHECK-LIS-NEXT: vadd.f32 q3, q5, q3 +; CHECK-LIS-NEXT: vmov.f32 s4, s10 +; CHECK-LIS-NEXT: vmov.f32 s6, s16 +; CHECK-LIS-NEXT: vmov.f32 s7, s19 +; CHECK-LIS-NEXT: vadd.f32 q1, q3, q1 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <24 x float>, ptr %src, align 4 %s1 = shufflevector <24 x float> %l1, <24 x float> undef, <8 x i32> @@ -933,80 +1287,155 @@ entry: } define void @vld3_v16f32(ptr %src, ptr %dst) { -; CHECK-LABEL: vld3_v16f32: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrw.u32 q0, [r0, #64] -; CHECK-NEXT: vldrw.u32 q1, [r0, #48] -; CHECK-NEXT: vldrw.u32 q4, [r0, #80] -; CHECK-NEXT: vldrw.u32 q6, [r0, #176] -; CHECK-NEXT: vmov.f32 s10, s2 -; CHECK-NEXT: vmov.f32 s13, s0 -; CHECK-NEXT: vmov.f32 s14, s3 -; CHECK-NEXT: vmov.f32 s8, s4 -; CHECK-NEXT: vmov.f32 s9, s7 -; CHECK-NEXT: vmov.f32 s12, s5 -; CHECK-NEXT: vmov.f32 s15, s18 -; CHECK-NEXT: vmov.f32 s11, s17 -; CHECK-NEXT: vadd.f32 q2, q2, q3 -; CHECK-NEXT: vmov.f32 s0, s6 -; CHECK-NEXT: vmov.f32 s2, s16 -; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vmov.f32 s3, s19 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] -; CHECK-NEXT: vadd.f32 q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmov.f32 s18, s7 -; CHECK-NEXT: vmov.f32 s22, s6 -; CHECK-NEXT: vmov.f32 s16, s9 -; CHECK-NEXT: vmov.f32 s19, s14 -; CHECK-NEXT: vmov.f32 s20, s8 -; CHECK-NEXT: vmov.f32 s21, s11 -; CHECK-NEXT: vmov.f32 s23, s13 -; CHECK-NEXT: vmov.f32 s4, s10 -; CHECK-NEXT: vldrw.u32 q2, [r0, #160] -; CHECK-NEXT: vmov.f32 s6, s12 -; CHECK-NEXT: vadd.f32 q4, q5, q4 -; CHECK-NEXT: vmov.f32 s7, s15 -; CHECK-NEXT: vldrw.u32 q3, [r0, #144] -; CHECK-NEXT: vadd.f32 q1, q4, q1 -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vmov.f32 s21, s8 -; CHECK-NEXT: vmov.f32 s22, s11 -; CHECK-NEXT: vmov.f32 s16, s12 -; CHECK-NEXT: vmov.f32 s17, s15 -; CHECK-NEXT: vmov.f32 s20, s13 -; CHECK-NEXT: vmov.f32 s23, s26 -; CHECK-NEXT: vmov.f32 s19, s25 -; CHECK-NEXT: vadd.f32 q4, q4, q5 -; CHECK-NEXT: vmov.f32 s8, s14 -; CHECK-NEXT: vmov.f32 s10, s24 -; CHECK-NEXT: vldrw.u32 q3, [r0, #112] -; CHECK-NEXT: vmov.f32 s11, s27 -; CHECK-NEXT: vldrw.u32 q5, [r0, #128] -; CHECK-NEXT: vadd.f32 q2, q4, q2 -; CHECK-NEXT: vldrw.u32 q4, [r0, #96] -; CHECK-NEXT: vmov.f32 s25, s12 -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s26, s15 -; CHECK-NEXT: vstrw.32 q0, [r1, #16] -; CHECK-NEXT: vmov.f32 s30, s14 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s24, s17 -; CHECK-NEXT: vmov.f32 s27, s22 -; CHECK-NEXT: vmov.f32 s28, s16 -; CHECK-NEXT: vmov.f32 s29, s19 -; CHECK-NEXT: vmov.f32 s31, s21 -; CHECK-NEXT: vadd.f32 q6, q7, q6 -; CHECK-NEXT: vmov.f32 s12, s18 -; CHECK-NEXT: vmov.f32 s14, s20 -; CHECK-NEXT: vmov.f32 s15, s23 -; CHECK-NEXT: vadd.f32 q3, q6, q3 -; CHECK-NEXT: vstrw.32 q3, [r1, #32] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: bx lr +; CHECK-LV-LABEL: vld3_v16f32: +; CHECK-LV: @ %bb.0: @ %entry +; CHECK-LV-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LV-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-LV-NEXT: vmov.f32 s10, s2 +; CHECK-LV-NEXT: vmov.f32 s13, s0 +; CHECK-LV-NEXT: vmov.f32 s14, s3 +; CHECK-LV-NEXT: vmov.f32 s8, s4 +; CHECK-LV-NEXT: vmov.f32 s9, s7 +; CHECK-LV-NEXT: vmov.f32 s12, s5 +; CHECK-LV-NEXT: vmov.f32 s15, s18 +; CHECK-LV-NEXT: vmov.f32 s11, s17 +; CHECK-LV-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LV-NEXT: vmov.f32 s0, s6 +; CHECK-LV-NEXT: vmov.f32 s2, s16 +; CHECK-LV-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LV-NEXT: vmov.f32 s3, s19 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-LV-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0] +; CHECK-LV-NEXT: vmov.f32 s17, s4 +; CHECK-LV-NEXT: vmov.f32 s18, s7 +; CHECK-LV-NEXT: vmov.f32 s22, s6 +; CHECK-LV-NEXT: vmov.f32 s16, s9 +; CHECK-LV-NEXT: vmov.f32 s19, s14 +; CHECK-LV-NEXT: vmov.f32 s20, s8 +; CHECK-LV-NEXT: vmov.f32 s21, s11 +; CHECK-LV-NEXT: vmov.f32 s23, s13 +; CHECK-LV-NEXT: vmov.f32 s4, s10 +; CHECK-LV-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-LV-NEXT: vmov.f32 s6, s12 +; CHECK-LV-NEXT: vadd.f32 q4, q5, q4 +; CHECK-LV-NEXT: vmov.f32 s7, s15 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-LV-NEXT: vadd.f32 q1, q4, q1 +; CHECK-LV-NEXT: vmov.f32 s18, s10 +; CHECK-LV-NEXT: vmov.f32 s21, s8 +; CHECK-LV-NEXT: vmov.f32 s22, s11 +; CHECK-LV-NEXT: vmov.f32 s16, s12 +; CHECK-LV-NEXT: vmov.f32 s17, s15 +; CHECK-LV-NEXT: vmov.f32 s20, s13 +; CHECK-LV-NEXT: vmov.f32 s23, s26 +; CHECK-LV-NEXT: vmov.f32 s19, s25 +; CHECK-LV-NEXT: vadd.f32 q4, q4, q5 +; CHECK-LV-NEXT: vmov.f32 s8, s14 +; CHECK-LV-NEXT: vmov.f32 s10, s24 +; CHECK-LV-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-LV-NEXT: vmov.f32 s11, s27 +; CHECK-LV-NEXT: vldrw.u32 q5, [r0, #128] +; CHECK-LV-NEXT: vadd.f32 q2, q4, q2 +; CHECK-LV-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-LV-NEXT: vmov.f32 s25, s12 +; CHECK-LV-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-LV-NEXT: vmov.f32 s26, s15 +; CHECK-LV-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LV-NEXT: vmov.f32 s30, s14 +; CHECK-LV-NEXT: vstrw.32 q1, [r1] +; CHECK-LV-NEXT: vmov.f32 s24, s17 +; CHECK-LV-NEXT: vmov.f32 s27, s22 +; CHECK-LV-NEXT: vmov.f32 s28, s16 +; CHECK-LV-NEXT: vmov.f32 s29, s19 +; CHECK-LV-NEXT: vmov.f32 s31, s21 +; CHECK-LV-NEXT: vadd.f32 q6, q7, q6 +; CHECK-LV-NEXT: vmov.f32 s12, s18 +; CHECK-LV-NEXT: vmov.f32 s14, s20 +; CHECK-LV-NEXT: vmov.f32 s15, s23 +; CHECK-LV-NEXT: vadd.f32 q3, q6, q3 +; CHECK-LV-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-LV-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LV-NEXT: bx lr +; +; CHECK-LIS-LABEL: vld3_v16f32: +; CHECK-LIS: @ %bb.0: @ %entry +; CHECK-LIS-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: vldrw.u32 q0, [r0, #64] +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #80] +; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #176] +; CHECK-LIS-NEXT: vmov.f32 s10, s2 +; CHECK-LIS-NEXT: vmov.f32 s13, s0 +; CHECK-LIS-NEXT: vmov.f32 s14, s3 +; CHECK-LIS-NEXT: vmov.f32 s8, s4 +; CHECK-LIS-NEXT: vmov.f32 s9, s7 +; CHECK-LIS-NEXT: vmov.f32 s12, s5 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s11, s17 +; CHECK-LIS-NEXT: vmov.f32 s0, s6 +; CHECK-LIS-NEXT: vadd.f32 q2, q2, q3 +; CHECK-LIS-NEXT: vmov.f32 s2, s16 +; CHECK-LIS-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-LIS-NEXT: vmov.f32 s3, s19 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #32] +; CHECK-LIS-NEXT: vadd.f32 q0, q2, q0 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0] +; CHECK-LIS-NEXT: vmov.f32 s13, s4 +; CHECK-LIS-NEXT: vmov.f32 s14, s7 +; CHECK-LIS-NEXT: vmov.f32 s22, s6 +; CHECK-LIS-NEXT: vmov.f32 s12, s9 +; CHECK-LIS-NEXT: vmov.f32 s15, s18 +; CHECK-LIS-NEXT: vmov.f32 s20, s8 +; CHECK-LIS-NEXT: vmov.f32 s21, s11 +; CHECK-LIS-NEXT: vmov.f32 s23, s17 +; CHECK-LIS-NEXT: vadd.f32 q3, q5, q3 +; CHECK-LIS-NEXT: vmov.f32 s4, s10 +; CHECK-LIS-NEXT: vmov.f32 s7, s19 +; CHECK-LIS-NEXT: vldrw.u32 q2, [r0, #160] +; CHECK-LIS-NEXT: vmov.f32 s6, s16 +; CHECK-LIS-NEXT: vadd.f32 q1, q3, q1 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #144] +; CHECK-LIS-NEXT: vmov.f32 s18, s10 +; CHECK-LIS-NEXT: vmov.f32 s21, s8 +; CHECK-LIS-NEXT: vmov.f32 s22, s11 +; CHECK-LIS-NEXT: vmov.f32 s16, s12 +; CHECK-LIS-NEXT: vmov.f32 s17, s15 +; CHECK-LIS-NEXT: vmov.f32 s20, s13 +; CHECK-LIS-NEXT: vmov.f32 s23, s26 +; CHECK-LIS-NEXT: vmov.f32 s19, s25 +; CHECK-LIS-NEXT: vmov.f32 s8, s14 +; CHECK-LIS-NEXT: vadd.f32 q4, q4, q5 +; CHECK-LIS-NEXT: vmov.f32 s10, s24 +; CHECK-LIS-NEXT: vldrw.u32 q3, [r0, #112] +; CHECK-LIS-NEXT: vmov.f32 s11, s27 +; CHECK-LIS-NEXT: vldrw.u32 q6, [r0, #128] +; CHECK-LIS-NEXT: vadd.f32 q2, q4, q2 +; CHECK-LIS-NEXT: vldrw.u32 q4, [r0, #96] +; CHECK-LIS-NEXT: vmov.f32 s21, s12 +; CHECK-LIS-NEXT: vstrw.32 q2, [r1, #48] +; CHECK-LIS-NEXT: vmov.f32 s22, s15 +; CHECK-LIS-NEXT: vstrw.32 q0, [r1, #16] +; CHECK-LIS-NEXT: vmov.f32 s30, s14 +; CHECK-LIS-NEXT: vstrw.32 q1, [r1] +; CHECK-LIS-NEXT: vmov.f32 s20, s17 +; CHECK-LIS-NEXT: vmov.f32 s23, s26 +; CHECK-LIS-NEXT: vmov.f32 s28, s16 +; CHECK-LIS-NEXT: vmov.f32 s29, s19 +; CHECK-LIS-NEXT: vmov.f32 s31, s25 +; CHECK-LIS-NEXT: vadd.f32 q5, q7, q5 +; CHECK-LIS-NEXT: vmov.f32 s12, s18 +; CHECK-LIS-NEXT: vmov.f32 s14, s24 +; CHECK-LIS-NEXT: vmov.f32 s15, s27 +; CHECK-LIS-NEXT: vadd.f32 q3, q5, q3 +; CHECK-LIS-NEXT: vstrw.32 q3, [r1, #32] +; CHECK-LIS-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-LIS-NEXT: bx lr entry: %l1 = load <48 x float>, ptr %src, align 4 %s1 = shufflevector <48 x float> %l1, <48 x float> undef, <16 x i32>