diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 9a7cc283b5c15..d66a8a896bae4 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -570,6 +570,18 @@ def FeatureD128 : SubtargetFeature<"d128", "HasD128",
     "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)",
     [FeatureLSE128]>;
 
+def FeatureDisableLdp : SubtargetFeature<"disable-ldp", "HasDisableLdp",
+                                         "true", "Do not emit ldp">;
+
+def FeatureDisableStp : SubtargetFeature<"disable-stp", "HasDisableStp",
+                                         "true", "Do not emit stp">;
+
+def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedOnly",
+    "true", "Emit ldp only when the load is aligned to at least 2 * element_size">;
+
+def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly",
+    "true", "Emit stp only when the store is aligned to at least 2 * element_size">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //===----------------------------------------------------------------------===//
@@ -1239,7 +1251,9 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
                                    FeatureArithmeticBccFusion,
                                    FeatureCmpBccFusion,
                                    FeatureFuseAddress,
-                                   FeatureFuseLiterals]>;
+                                   FeatureFuseLiterals,
+                                   FeatureLdpAlignedOnly,
+                                   FeatureStpAlignedOnly]>;
 
 def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     "Ampere Computing Ampere-1A processors", [
@@ -1252,7 +1266,8 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     FeatureCmpBccFusion,
                                     FeatureFuseAddress,
                                     FeatureFuseLiterals,
-                                    FeatureFuseLiterals]>;
+                                    FeatureLdpAlignedOnly,
+                                    FeatureStpAlignedOnly]>;
 
 def ProcessorFeatures {
   list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 41af5522d967d..dc2965178bc28 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -2136,6 +2136,14 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
   if (!TII->isCandidateToMergeOrPair(MI))
     return false;
 
+  // If the disable-ldp feature is set, do not emit ldp.
+  if (MI.mayLoad() && Subtarget->hasDisableLdp())
+    return false;
+
+  // If the disable-stp feature is set, do not emit stp.
+  if (MI.mayStore() && Subtarget->hasDisableStp())
+    return false;
+
   // Early exit if the offset is not possible to match. (6 bits of positive
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
@@ -2159,6 +2167,31 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
   // Keeping the iterator straight is a pain, so we let the merge routine tell
   // us what the next instruction is after it's done mucking about.
   auto Prev = std::prev(MBBI);
+
+  // Fetch the memoperand of the load/store that is a candidate for
+  // combination.
+  MachineMemOperand *MemOp =
+      MI.memoperands_empty() ? nullptr : MI.memoperands().front();
+
+  // Get the alignments to check when the ldp-aligned-only or
+  // stp-aligned-only features are set.
+  uint64_t MemAlignment = MemOp ? MemOp->getAlign().value() : -1;
+  uint64_t TypeAlignment = MemOp ? Align(MemOp->getSize()).value() : -1;
+
+  // If this is a load and the ldp-aligned-only feature is set, only form an
+  // ldp when the alignment of the source pointer is at least twice the
+  // access size.
+  if (MI.mayLoad() && Subtarget->hasLdpAlignedOnly() && MemOp &&
+      MemAlignment < 2 * TypeAlignment)
+    return false;
+
+  // If this is a store and the stp-aligned-only feature is set, only form an
+  // stp when the alignment of the destination pointer is at least twice the
+  // access size.
+  if (MI.mayStore() && Subtarget->hasStpAlignedOnly() && MemOp &&
+      MemAlignment < 2 * TypeAlignment)
+    return false;
+
   MBBI = mergePairedInsns(MBBI, Paired, Flags);
   // Collect liveness info for instructions between Prev and the new position
   // MBBI.
diff --git a/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll b/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
new file mode 100644
index 0000000000000..838df340b402e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
@@ -0,0 +1,389 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -O2 -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-DEFAULT
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-DISABLE-LDP
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-DISABLE-STP
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-DISABLE-LDP
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-DISABLE-STP
+
+define i32 @ldp_aligned_int32_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_aligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    ldp w9, w8, [x8]
+; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: ldp_aligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-DEFAULT-NEXT:    ldp w9, w8, [x8]
+; CHECK-DEFAULT-NEXT:    add w0, w8, w9
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-LDP-LABEL: ldp_aligned_int32_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-DISABLE-LDP-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-DISABLE-LDP-NEXT:    ldr w9, [x8]
+; CHECK-DISABLE-LDP-NEXT:    ldr w8, [x8, #4]
+; CHECK-DISABLE-LDP-NEXT:    add w0, w8, w9
+; CHECK-DISABLE-LDP-NEXT:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -64
+  %4 = inttoptr i64 %3 to ptr
+  %5 = load i32, ptr %4, align 64
+  %6 = getelementptr inbounds i32, ptr %4, i64 1
+  %7 = load i32, ptr %6, align 4
+  %8 = add nsw i32 %7, %5
+  ret i32 %8
+}
+
+define i64 @ldp_aligned_int64_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_aligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:    ldp x9, x8, [x8]
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: ldp_aligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-DEFAULT-NEXT:    ldp x9, x8, [x8]
+; CHECK-DEFAULT-NEXT:    add x0, x8, x9
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-LDP-LABEL: ldp_aligned_int64_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-DISABLE-LDP-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-DISABLE-LDP-NEXT:    ldr x9, [x8]
+; CHECK-DISABLE-LDP-NEXT:    ldr x8, [x8, #8]
+; CHECK-DISABLE-LDP-NEXT:    add x0, x8, x9
+; CHECK-DISABLE-LDP-NEXT:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -128
+  %4 = inttoptr i64 %3 to ptr
+  %5 = load i64, ptr %4, align 128
+  %6 = getelementptr inbounds i64, ptr %4, i64 1
+  %7 = load i64, ptr %6, align 8
+  %8 = add nsw i64 %7, %5
+  ret i64 %8
+}
+
+define <4 x i32> @ldp_aligned_v4si(ptr %0) #0 {
+; CHECK-LABEL: ldp_aligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:    ldp q0, q1, [x8]
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: ldp_aligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-DEFAULT-NEXT:    ldp q0, q1, [x8]
+; CHECK-DEFAULT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-LDP-LABEL: ldp_aligned_v4si:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-DISABLE-LDP-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-DISABLE-LDP-NEXT:    ldr q0, [x8]
+; CHECK-DISABLE-LDP-NEXT:    ldr q1, [x8, #16]
+; CHECK-DISABLE-LDP-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DISABLE-LDP-NEXT:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -256
+  %4 = inttoptr i64 %3 to ptr
+  %5 = load <4 x i32>, ptr %4, align 256
+  %6 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
+  %7 = load <4 x i32>, ptr %6, align 16
+  %8 = add <4 x i32> %7, %5
+  ret <4 x i32> %8
+}
+
+define i32 @ldp_unaligned_int32_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_unaligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    ldr w9, [x8, #4]
+; CHECK-NEXT:    ldr w8, [x8, #8]
+; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: ldp_unaligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-DEFAULT-NEXT:    ldp w9, w8, [x8, #4]
+; CHECK-DEFAULT-NEXT:    add w0, w8, w9
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-LDP-LABEL: ldp_unaligned_int32_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-DISABLE-LDP-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-DISABLE-LDP-NEXT:    ldr w9, [x8, #4]
+; CHECK-DISABLE-LDP-NEXT:    ldr w8, [x8, #8]
+; CHECK-DISABLE-LDP-NEXT:    add w0, w8, w9
+; CHECK-DISABLE-LDP-NEXT:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -64
+  %4 = inttoptr i64 %3 to ptr
+  %5 = getelementptr inbounds i32, ptr %4, i64 1
+  %6 = load i32, ptr %5, align 4
+  %7 = getelementptr inbounds i32, ptr %4, i64 2
+  %8 = load i32, ptr %7, align 8
+  %9 = add nsw i32 %8, %6
+  ret i32 %9
+}
+
+define i64 @ldp_unaligned_int64_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_unaligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:    ldr x9, [x8, #8]
+; CHECK-NEXT:    ldr x8, [x8, #16]
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: ldp_unaligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-DEFAULT-NEXT:    ldp x9, x8, [x8, #8]
+; CHECK-DEFAULT-NEXT:    add x0, x8, x9
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-LDP-LABEL: ldp_unaligned_int64_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-DISABLE-LDP-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-DISABLE-LDP-NEXT:    ldr x9, [x8, #8]
+; CHECK-DISABLE-LDP-NEXT:    ldr x8, [x8, #16]
+; CHECK-DISABLE-LDP-NEXT:    add x0, x8, x9
+; CHECK-DISABLE-LDP-NEXT:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -128
+  %4 = inttoptr i64 %3 to ptr
+  %5 = getelementptr inbounds i64, ptr %4, i64 1
+  %6 = load i64, ptr %5, align 8
+  %7 = getelementptr inbounds i64, ptr %4, i64 2
+  %8 = load i64, ptr %7, align 16
+  %9 = add nsw i64 %8, %6
+  ret i64 %9
+}
+
+define <4 x i32> @ldp_unaligned_v4si(ptr %0) #0 {
+; CHECK-LABEL: ldp_unaligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:    ldr q0, [x8, #16]
+; CHECK-NEXT:    ldr q1, [x8, #32]
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: ldp_unaligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-DEFAULT-NEXT:    ldp q0, q1, [x8, #16]
+; CHECK-DEFAULT-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-LDP-LABEL: ldp_unaligned_v4si:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-DISABLE-LDP-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-DISABLE-LDP-NEXT:    ldr q0, [x8, #16]
+; CHECK-DISABLE-LDP-NEXT:    ldr q1, [x8, #32]
+; CHECK-DISABLE-LDP-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-DISABLE-LDP-NEXT:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -256
+  %4 = inttoptr i64 %3 to ptr
+  %5 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
+  %6 = load <4 x i32>, ptr %5, align 16
+  %7 = getelementptr inbounds <4 x i32>, ptr %4, i64 2
+  %8 = load <4 x i32>, ptr %7, align 32
+  %9 = add <4 x i32> %8, %6
+  ret <4 x i32> %9
+}
+
+define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_aligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    stp w1, w1, [x0]
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: stp_aligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x0, x0, #0xffffffffffffffc0
+; CHECK-DEFAULT-NEXT:    stp w1, w1, [x0]
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-STP-LABEL: stp_aligned_int32_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-DISABLE-STP-NEXT:    and x0, x0, #0xffffffffffffffc0
+; CHECK-DISABLE-STP-NEXT:    str w1, [x0]
+; CHECK-DISABLE-STP-NEXT:    str w1, [x0, #4]
+; CHECK-DISABLE-STP-NEXT:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  store i32 %1, ptr %5, align 64
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  ret ptr %5
+}
+
+define dso_local ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_aligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xffffffffffffff80
+; CHECK-NEXT:    stp x1, x1, [x0]
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: stp_aligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x0, x0, #0xffffffffffffff80
+; CHECK-DEFAULT-NEXT:    stp x1, x1, [x0]
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-STP-LABEL: stp_aligned_int64_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-DISABLE-STP-NEXT:    and x0, x0, #0xffffffffffffff80
+; CHECK-DISABLE-STP-NEXT:    str x1, [x0]
+; CHECK-DISABLE-STP-NEXT:    str x1, [x0, #8]
+; CHECK-DISABLE-STP-NEXT:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  store i64 %1, ptr %5, align 128
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  ret ptr %5
+}
+
+define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_aligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xffffffffffffff00
+; CHECK-NEXT:    stp q0, q0, [x0]
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: stp_aligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x0, x0, #0xffffffffffffff00
+; CHECK-DEFAULT-NEXT:    stp q0, q0, [x0]
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-STP-LABEL: stp_aligned_v4si:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-DISABLE-STP-NEXT:    and x0, x0, #0xffffffffffffff00
+; CHECK-DISABLE-STP-NEXT:    str q0, [x0]
+; CHECK-DISABLE-STP-NEXT:    str q0, [x0, #16]
+; CHECK-DISABLE-STP-NEXT:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+  store <4 x i32> %1, ptr %5, align 256
+  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
+  store <4 x i32> %1, ptr %6, align 16
+  ret ptr %5
+}
+
+define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    orr x0, x8, #0x4
+; CHECK-NEXT:    str w1, [x8, #4]
+; CHECK-NEXT:    str w1, [x8, #8]
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: stp_unaligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-DEFAULT-NEXT:    orr x0, x8, #0x4
+; CHECK-DEFAULT-NEXT:    stp w1, w1, [x8, #4]
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-STP-LABEL: stp_unaligned_int32_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-DISABLE-STP-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-DISABLE-STP-NEXT:    orr x0, x8, #0x4
+; CHECK-DISABLE-STP-NEXT:    str w1, [x8, #4]
+; CHECK-DISABLE-STP-NEXT:    str w1, [x8, #8]
+; CHECK-DISABLE-STP-NEXT:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  %7 = getelementptr inbounds i32, ptr %5, i64 2
+  store i32 %1, ptr %7, align 8
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:    orr x0, x8, #0x8
+; CHECK-NEXT:    str x1, [x8, #8]
+; CHECK-NEXT:    str x1, [x8, #16]
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: stp_unaligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-DEFAULT-NEXT:    orr x0, x8, #0x8
+; CHECK-DEFAULT-NEXT:    stp x1, x1, [x8, #8]
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-STP-LABEL: stp_unaligned_int64_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-DISABLE-STP-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-DISABLE-STP-NEXT:    orr x0, x8, #0x8
+; CHECK-DISABLE-STP-NEXT:    str x1, [x8, #8]
+; CHECK-DISABLE-STP-NEXT:    str x1, [x8, #16]
+; CHECK-DISABLE-STP-NEXT:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  %7 = getelementptr inbounds i64, ptr %5, i64 2
+  store i64 %1, ptr %7, align 16
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_unaligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:    orr x0, x8, #0x10
+; CHECK-NEXT:    str q0, [x8, #16]
+; CHECK-NEXT:    str q0, [x8, #32]
+; CHECK-NEXT:    ret
+;
+; CHECK-DEFAULT-LABEL: stp_unaligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-DEFAULT-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-DEFAULT-NEXT:    orr x0, x8, #0x10
+; CHECK-DEFAULT-NEXT:    stp q0, q0, [x8, #16]
+; CHECK-DEFAULT-NEXT:    ret
+;
+; CHECK-DISABLE-STP-LABEL: stp_unaligned_v4si:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-DISABLE-STP-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-DISABLE-STP-NEXT:    orr x0, x8, #0x10
+; CHECK-DISABLE-STP-NEXT:    str q0, [x8, #16]
+; CHECK-DISABLE-STP-NEXT:    str q0, [x8, #32]
+; CHECK-DISABLE-STP-NEXT:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
+  store <4 x i32> %1, ptr %6, align 16
+  %7 = getelementptr inbounds <4 x i32>, ptr %5, i64 2
+  store <4 x i32> %1, ptr %7, align 32
+  ret ptr %6
+}
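
Reviewer note: the four feature bits added above reduce to a single predicate over the candidate's first memoperand. Below is a minimal standalone C++ sketch of that predicate for experimenting with the policy outside the pass; mayPair, PairingFeatures, and the main driver are hypothetical names introduced for illustration, and plain byte counts stand in for llvm::Align and MachineMemOperand::getSize().

// Standalone sketch (not part of the patch) of the pairing gate that
// tryToPairLdStInst applies with the new subtarget features.
#include <cstdint>
#include <cstdio>

struct PairingFeatures {
  bool DisableLdp = false;     // +disable-ldp
  bool DisableStp = false;     // +disable-stp
  bool LdpAlignedOnly = false; // +ldp-aligned-only
  bool StpAlignedOnly = false; // +stp-aligned-only
};

// Returns true when a load/store of SizeBytes at AlignBytes alignment may
// still be merged into an ldp/stp under the given feature set.
bool mayPair(bool IsLoad, uint64_t AlignBytes, uint64_t SizeBytes,
             const PairingFeatures &F) {
  if (IsLoad && F.DisableLdp)
    return false;
  if (!IsLoad && F.DisableStp)
    return false;
  // The *-aligned-only features require the pointer to be aligned to at
  // least twice the access size (mirrors MemAlignment < 2 * TypeAlignment).
  if (IsLoad && F.LdpAlignedOnly && AlignBytes < 2 * SizeBytes)
    return false;
  if (!IsLoad && F.StpAlignedOnly && AlignBytes < 2 * SizeBytes)
    return false;
  return true;
}

int main() {
  PairingFeatures Ampere1; // the ampere1/ampere1a tuning in this patch
  Ampere1.LdpAlignedOnly = Ampere1.StpAlignedOnly = true;
  // A 4-byte load from a 64-byte-aligned pointer may pair (64 >= 2 * 4)...
  std::printf("%d\n", mayPair(true, 64, 4, Ampere1)); // prints 1
  // ...but a 4-byte load from a 4-byte-aligned pointer may not.
  std::printf("%d\n", mayPair(true, 4, 4, Ampere1)); // prints 0
}

With the ampere1/ampere1a tuning in this patch only the two aligned-only bits are set, so ldp/stp still form whenever the pointer is known to be aligned to twice the access width, which is exactly the split the ldp_aligned_*/ldp_unaligned_* and stp_aligned_*/stp_unaligned_* tests exercise.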