From b3f6eb4db6b0cb39a1071800a3ced61b960c7d34 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Mon, 27 Jan 2025 13:10:14 +0000
Subject: [PATCH] [AArch64] Avoid generating LDAPUR on certain cores

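On the CPUs listed below, we want to avoid LDAPUR for performance reasons. Add
a tuning feature (avoid-ldapur) that stops the LDAPUR load patterns from being
selected, so that add+ldapr is emitted instead, when using:
 -mcpu=neoverse-v2
 -mcpu=neoverse-v3
 -mcpu=neoverse-v3ae
 -mcpu=cortex-x3
 -mcpu=cortex-x4
 -mcpu=cortex-x925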
---
 llvm/lib/Target/AArch64/AArch64Features.td    |   6 +-
 .../lib/Target/AArch64/AArch64InstrAtomics.td |   4 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |   2 +
 llvm/lib/Target/AArch64/AArch64Processors.td  |   6 +
 .../Atomics/aarch64-atomic-load-rcpc_immo.ll  | 144 ++++++++++++++----
 5 files changed, 127 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td
index 0a91edb4c1661..20db70ee38572 100644
--- a/llvm/lib/Target/AArch64/AArch64Features.td
+++ b/llvm/lib/Target/AArch64/AArch64Features.td
@@ -805,10 +805,14 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO
 def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly",
     "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">;
 
-def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-if-equal-cost",
+def FeatureUseFixedOverScalableIfEqualCost : SubtargetFeature<"use-fixed-over-scalable-if-equal-cost",
   "UseFixedOverScalableIfEqualCost", "true",
   "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">;
 
+// For performance reasons, prefer add+ldapr over ldapur on certain cores.
+def FeatureAvoidLDAPUR : SubtargetFeature<"avoid-ldapur", "AvoidLDAPUR", "true",
+  "Prefer add+ldapr to offset ldapur">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index de94cf64c9801..5e6db9d007a55 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -575,7 +575,7 @@ let Predicates = [HasRCPC3, HasNEON] in {
 }
 
 // v8.4a FEAT_LRCPC2 patterns
-let Predicates = [HasRCPC_IMMO] in {
+let Predicates = [HasRCPC_IMMO, UseLDAPUR] in {
   // Load-Acquire RCpc Register unscaled loads
   def : Pat<(acquiring_load<atomic_load_az_8>
                (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
@@ -589,7 +589,9 @@ let Predicates = [HasRCPC_IMMO] in {
   def : Pat<(acquiring_load<atomic_load_64>
                (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
           (LDAPURXi GPR64sp:$Rn, simm9:$offset)>;
+}
 
+let Predicates = [HasRCPC_IMMO] in {
   // Store-Release Register unscaled stores
   def : Pat<(releasing_store<atomic_store_8>
                (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index fa6385409f30c..9d0bd44544134 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -389,6 +389,8 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
 
 def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
 
+def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">;
+
 def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
                                   SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                                        SDTCisInt<1>]>>;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 0e3c4e8397f52..8a2c0442a0c0d 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -240,6 +240,7 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
                                FeaturePostRAScheduler,
                                FeatureEnableSelectOptimize,
                                FeatureUseFixedOverScalableIfEqualCost,
+                               FeatureAvoidLDAPUR,
                                FeaturePredictableSelectIsExpensive]>;
 
 def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
@@ -250,6 +251,7 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4",
                                FeaturePostRAScheduler,
                                FeatureEnableSelectOptimize,
                                FeatureUseFixedOverScalableIfEqualCost,
+                               FeatureAvoidLDAPUR,
                                FeaturePredictableSelectIsExpensive]>;
 
 def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily",
@@ -260,6 +262,7 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily",
                                 FeaturePostRAScheduler,
                                 FeatureEnableSelectOptimize,
                                 FeatureUseFixedOverScalableIfEqualCost,
+                                FeatureAvoidLDAPUR,
                                 FeaturePredictableSelectIsExpensive]>;
 
 def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
@@ -540,6 +543,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
                                       FeatureUseFixedOverScalableIfEqualCost,
+                                      FeatureAvoidLDAPUR,
                                       FeaturePredictableSelectIsExpensive]>;
 
 def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3",
@@ -549,6 +553,7 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3
                                       FeatureFuseAdrpAdd,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
+                                      FeatureAvoidLDAPUR,
                                       FeaturePredictableSelectIsExpensive]>;
 
 def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3",
@@ -558,6 +563,7 @@ def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "Neover
                                       FeatureFuseAdrpAdd,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
+                                      FeatureAvoidLDAPUR,
                                       FeaturePredictableSelectIsExpensive]>;
 
 def TuneSaphira  : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll
index 9687ba683fb7e..b475e68db411a 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll
@@ -1,6 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "(?!^\s*lda.*\bsp\b)^\s*.*\bsp\b" --filter "^\s*(ld|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)"
 ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=true -global-isel-abort=2 -O0 | FileCheck %s --check-prefixes=CHECK,GISEL
-; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-NOAVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo,avoid-ldapur -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v2 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x4 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
+; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x925 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR
 
 define i8 @load_atomic_i8_aligned_unordered(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i8_aligned_unordered:
@@ -39,8 +45,12 @@ define i8 @load_atomic_i8_aligned_acquire(ptr %ptr) {
 ; GISEL:    add x8, x0, #4
 ; GISEL:    ldaprb w0, [x8]
 ;
-; SDAG-LABEL: load_atomic_i8_aligned_acquire:
-; SDAG:    ldapurb w0, [x0, #4]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire:
+; SDAG-NOAVOIDLDAPUR:    ldapurb w0, [x0, #4]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #4
+; SDAG-AVOIDLDAPUR:    ldaprb w0, [x8]
     %gep = getelementptr inbounds i8, ptr %ptr, i32 4
     %r = load atomic i8, ptr %gep acquire, align 1
     ret i8 %r
@@ -51,8 +61,12 @@ define i8 @load_atomic_i8_aligned_acquire_const(ptr readonly %ptr) {
 ; GISEL:    add x8, x0, #4
 ; GISEL:    ldaprb w0, [x8]
 ;
-; SDAG-LABEL: load_atomic_i8_aligned_acquire_const:
-; SDAG:    ldapurb w0, [x0, #4]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire_const:
+; SDAG-NOAVOIDLDAPUR:    ldapurb w0, [x0, #4]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire_const:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #4
+; SDAG-AVOIDLDAPUR:    ldaprb w0, [x8]
     %gep = getelementptr inbounds i8, ptr %ptr, i32 4
     %r = load atomic i8, ptr %gep acquire, align 1
     ret i8 %r
@@ -113,8 +127,12 @@ define i16 @load_atomic_i16_aligned_acquire(ptr %ptr) {
 ; GISEL:    add x8, x0, #8
 ; GISEL:    ldaprh w0, [x8]
 ;
-; SDAG-LABEL: load_atomic_i16_aligned_acquire:
-; SDAG:    ldapurh w0, [x0, #8]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire:
+; SDAG-NOAVOIDLDAPUR:    ldapurh w0, [x0, #8]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #8
+; SDAG-AVOIDLDAPUR:    ldaprh w0, [x8]
     %gep = getelementptr inbounds i16, ptr %ptr, i32 4
     %r = load atomic i16, ptr %gep acquire, align 2
     ret i16 %r
@@ -125,8 +143,12 @@ define i16 @load_atomic_i16_aligned_acquire_const(ptr readonly %ptr) {
 ; GISEL:    add x8, x0, #8
 ; GISEL:    ldaprh w0, [x8]
 ;
-; SDAG-LABEL: load_atomic_i16_aligned_acquire_const:
-; SDAG:    ldapurh w0, [x0, #8]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire_const:
+; SDAG-NOAVOIDLDAPUR:    ldapurh w0, [x0, #8]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire_const:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #8
+; SDAG-AVOIDLDAPUR:    ldaprh w0, [x8]
     %gep = getelementptr inbounds i16, ptr %ptr, i32 4
     %r = load atomic i16, ptr %gep acquire, align 2
     ret i16 %r
@@ -183,16 +205,30 @@ define i32 @load_atomic_i32_aligned_monotonic_const(ptr readonly %ptr) {
 }
 
 define i32 @load_atomic_i32_aligned_acquire(ptr %ptr) {
-; CHECK-LABEL: load_atomic_i32_aligned_acquire:
-; CHECK:    ldapur w0, [x0, #16]
+; GISEL-LABEL: load_atomic_i32_aligned_acquire:
+; GISEL:    ldapur w0, [x0, #16]
+;
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire:
+; SDAG-NOAVOIDLDAPUR:    ldapur w0, [x0, #16]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #16
+; SDAG-AVOIDLDAPUR:    ldapr w0, [x8]
     %gep = getelementptr inbounds i32, ptr %ptr, i32 4
     %r = load atomic i32, ptr %gep acquire, align 4
     ret i32 %r
 }
 
 define i32 @load_atomic_i32_aligned_acquire_const(ptr readonly %ptr) {
-; CHECK-LABEL: load_atomic_i32_aligned_acquire_const:
-; CHECK:    ldapur w0, [x0, #16]
+; GISEL-LABEL: load_atomic_i32_aligned_acquire_const:
+; GISEL:    ldapur w0, [x0, #16]
+;
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire_const:
+; SDAG-NOAVOIDLDAPUR:    ldapur w0, [x0, #16]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire_const:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #16
+; SDAG-AVOIDLDAPUR:    ldapr w0, [x8]
     %gep = getelementptr inbounds i32, ptr %ptr, i32 4
     %r = load atomic i32, ptr %gep acquire, align 4
     ret i32 %r
@@ -249,16 +285,30 @@ define i64 @load_atomic_i64_aligned_monotonic_const(ptr readonly %ptr) {
 }
 
 define i64 @load_atomic_i64_aligned_acquire(ptr %ptr) {
-; CHECK-LABEL: load_atomic_i64_aligned_acquire:
-; CHECK:    ldapur x0, [x0, #32]
+; GISEL-LABEL: load_atomic_i64_aligned_acquire:
+; GISEL:    ldapur x0, [x0, #32]
+;
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire:
+; SDAG-NOAVOIDLDAPUR:    ldapur x0, [x0, #32]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #32
+; SDAG-AVOIDLDAPUR:    ldapr x0, [x8]
     %gep = getelementptr inbounds i64, ptr %ptr, i32 4
     %r = load atomic i64, ptr %gep acquire, align 8
     ret i64 %r
 }
 
 define i64 @load_atomic_i64_aligned_acquire_const(ptr readonly %ptr) {
-; CHECK-LABEL: load_atomic_i64_aligned_acquire_const:
-; CHECK:    ldapur x0, [x0, #32]
+; GISEL-LABEL: load_atomic_i64_aligned_acquire_const:
+; GISEL:    ldapur x0, [x0, #32]
+;
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire_const:
+; SDAG-NOAVOIDLDAPUR:    ldapur x0, [x0, #32]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire_const:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #32
+; SDAG-AVOIDLDAPUR:    ldapr x0, [x8]
     %gep = getelementptr inbounds i64, ptr %ptr, i32 4
     %r = load atomic i64, ptr %gep acquire, align 8
     ret i64 %r
@@ -387,8 +437,12 @@ define i8 @load_atomic_i8_unaligned_acquire(ptr %ptr) {
 ; GISEL:    add x8, x0, #4
 ; GISEL:    ldaprb w0, [x8]
 ;
-; SDAG-LABEL: load_atomic_i8_unaligned_acquire:
-; SDAG:    ldapurb w0, [x0, #4]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire:
+; SDAG-NOAVOIDLDAPUR:    ldapurb w0, [x0, #4]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #4
+; SDAG-AVOIDLDAPUR:    ldaprb w0, [x8]
     %gep = getelementptr inbounds i8, ptr %ptr, i32 4
     %r = load atomic i8, ptr %gep acquire, align 1
     ret i8 %r
@@ -399,8 +453,12 @@ define i8 @load_atomic_i8_unaligned_acquire_const(ptr readonly %ptr) {
 ; GISEL:    add x8, x0, #4
 ; GISEL:    ldaprb w0, [x8]
 ;
-; SDAG-LABEL: load_atomic_i8_unaligned_acquire_const:
-; SDAG:    ldapurb w0, [x0, #4]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire_const:
+; SDAG-NOAVOIDLDAPUR:    ldapurb w0, [x0, #4]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire_const:
+; SDAG-AVOIDLDAPUR:    add x8, x0, #4
+; SDAG-AVOIDLDAPUR:    ldaprb w0, [x8]
     %gep = getelementptr inbounds i8, ptr %ptr, i32 4
     %r = load atomic i8, ptr %gep acquire, align 1
     ret i8 %r
@@ -846,9 +904,14 @@ define i8 @load_atomic_i8_from_gep() {
 ; GISEL:    add x8, x8, #1
 ; GISEL:    ldaprb w0, [x8]
 ;
-; SDAG-LABEL: load_atomic_i8_from_gep:
-; SDAG:    bl init
-; SDAG:    ldapurb w0, [sp, #13]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_from_gep:
+; SDAG-NOAVOIDLDAPUR:    bl init
+; SDAG-NOAVOIDLDAPUR:    ldapurb w0, [sp, #13]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_from_gep:
+; SDAG-AVOIDLDAPUR:    bl init
+; SDAG-AVOIDLDAPUR:    orr x8, x19, #0x1
+; SDAG-AVOIDLDAPUR:    ldaprb w0, [x8]
   %a = alloca [3 x i8]
   call void @init(ptr %a)
   %arrayidx  = getelementptr [3 x i8], ptr %a, i64 0, i64 1
@@ -862,9 +925,14 @@ define i16 @load_atomic_i16_from_gep() {
 ; GISEL:    add x8, x8, #2
 ; GISEL:    ldaprh w0, [x8]
 ;
-; SDAG-LABEL: load_atomic_i16_from_gep:
-; SDAG:    bl init
-; SDAG:    ldapurh w0, [sp, #10]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_from_gep:
+; SDAG-NOAVOIDLDAPUR:    bl init
+; SDAG-NOAVOIDLDAPUR:    ldapurh w0, [sp, #10]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_from_gep:
+; SDAG-AVOIDLDAPUR:    bl init
+; SDAG-AVOIDLDAPUR:    orr x8, x19, #0x2
+; SDAG-AVOIDLDAPUR:    ldaprh w0, [x8]
   %a = alloca [3 x i16]
   call void @init(ptr %a)
   %arrayidx  = getelementptr [3 x i16], ptr %a, i64 0, i64 1
@@ -877,9 +945,14 @@ define i32 @load_atomic_i32_from_gep() {
 ; GISEL:    bl init
 ; GISEL:    ldapur w0, [x8, #4]
 ;
-; SDAG-LABEL: load_atomic_i32_from_gep:
-; SDAG:    bl init
-; SDAG:    ldapur w0, [sp, #8]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_from_gep:
+; SDAG-NOAVOIDLDAPUR:    bl init
+; SDAG-NOAVOIDLDAPUR:    ldapur w0, [sp, #8]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_from_gep:
+; SDAG-AVOIDLDAPUR:    bl init
+; SDAG-AVOIDLDAPUR:    add x8, x19, #4
+; SDAG-AVOIDLDAPUR:    ldapr w0, [x8]
   %a = alloca [3 x i32]
   call void @init(ptr %a)
   %arrayidx  = getelementptr [3 x i32], ptr %a, i64 0, i64 1
@@ -892,9 +965,14 @@ define i64 @load_atomic_i64_from_gep() {
 ; GISEL:    bl init
 ; GISEL:    ldapur x0, [x8, #8]
 ;
-; SDAG-LABEL: load_atomic_i64_from_gep:
-; SDAG:    bl init
-; SDAG:    ldapur x0, [sp, #16]
+; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_from_gep:
+; SDAG-NOAVOIDLDAPUR:    bl init
+; SDAG-NOAVOIDLDAPUR:    ldapur x0, [sp, #16]
+;
+; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_from_gep:
+; SDAG-AVOIDLDAPUR:    bl init
+; SDAG-AVOIDLDAPUR:    add x8, x19, #8
+; SDAG-AVOIDLDAPUR:    ldapr x0, [x8]
   %a = alloca [3 x i64]
   call void @init(ptr %a)
   %arrayidx  = getelementptr [3 x i64], ptr %a, i64 0, i64 1