From b3f6eb4db6b0cb39a1071800a3ced61b960c7d34 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 27 Jan 2025 13:10:14 +0000 Subject: [PATCH] [AArch64] Avoid generating LDAPUR on certain cores On the CPUs listed below, we want to avoid LDAPUR for performance reasons. Add a tuning feature that disables LDAPUR generation (falling back to add+ldapr) when using: -mcpu=neoverse-v2 -mcpu=neoverse-v3 -mcpu=neoverse-v3ae -mcpu=cortex-x3 -mcpu=cortex-x4 -mcpu=cortex-x925 --- llvm/lib/Target/AArch64/AArch64Features.td | 6 +- .../lib/Target/AArch64/AArch64InstrAtomics.td | 4 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 + llvm/lib/Target/AArch64/AArch64Processors.td | 6 + .../Atomics/aarch64-atomic-load-rcpc_immo.ll | 144 ++++++++++++++---- 5 files changed, 127 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 0a91edb4c1661..20db70ee38572 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -805,10 +805,14 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly", "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">; -def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-if-equal-cost", +def FeatureUseFixedOverScalableIfEqualCost : SubtargetFeature<"use-fixed-over-scalable-if-equal-cost", "UseFixedOverScalableIfEqualCost", "true", "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">; +// For performance reasons we prefer to use ldapr to ldapur on certain cores. +def FeatureAvoidLDAPUR : SubtargetFeature<"avoid-ldapur", "AvoidLDAPUR", "true", + "Prefer add+ldapr to offset ldapur">; + //===----------------------------------------------------------------------===// // Architectures. 
// diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index de94cf64c9801..5e6db9d007a55 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -575,7 +575,7 @@ let Predicates = [HasRCPC3, HasNEON] in { } // v8.4a FEAT_LRCPC2 patterns -let Predicates = [HasRCPC_IMMO] in { +let Predicates = [HasRCPC_IMMO, UseLDAPUR] in { // Load-Acquire RCpc Register unscaled loads def : Pat<(acquiring_load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), @@ -589,7 +589,9 @@ let Predicates = [HasRCPC_IMMO] in { def : Pat<(acquiring_load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (LDAPURXi GPR64sp:$Rn, simm9:$offset)>; +} +let Predicates = [HasRCPC_IMMO] in { // Store-Release Register unscaled stores def : Pat<(releasing_store (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index fa6385409f30c..9d0bd44544134 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -389,6 +389,8 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">; def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; +def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">; + def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 0e3c4e8397f52..8a2c0442a0c0d 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -240,6 +240,7 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneX4 : SubtargetFeature<"cortex-x4", 
"ARMProcFamily", "CortexX4", @@ -250,6 +251,7 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", @@ -260,6 +262,7 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", @@ -540,6 +543,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2 FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3", @@ -549,6 +553,7 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3 FeatureFuseAdrpAdd, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3", @@ -558,6 +563,7 @@ def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "Neover FeatureFuseAdrpAdd, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll index 9687ba683fb7e..b475e68db411a 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "(?!^\s*lda.*\bsp\b)^\s*.*\bsp\b" --filter "^\s*(ld|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)" ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=true -global-isel-abort=2 -O0 | FileCheck %s --check-prefixes=CHECK,GISEL -; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-NOAVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo,avoid-ldapur -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v2 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x4 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x925 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR define i8 @load_atomic_i8_aligned_unordered(ptr %ptr) { ; CHECK-LABEL: load_atomic_i8_aligned_unordered: @@ -39,8 +45,12 @@ define i8 @load_atomic_i8_aligned_acquire(ptr %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_aligned_acquire: -; SDAG: ldapurb w0, [x0, #4] +; 
SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -51,8 +61,12 @@ define i8 @load_atomic_i8_aligned_acquire_const(ptr readonly %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_aligned_acquire_const: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -113,8 +127,12 @@ define i16 @load_atomic_i16_aligned_acquire(ptr %ptr) { ; GISEL: add x8, x0, #8 ; GISEL: ldaprh w0, [x8] ; -; SDAG-LABEL: load_atomic_i16_aligned_acquire: -; SDAG: ldapurh w0, [x0, #8] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapurh w0, [x0, #8] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #8 +; SDAG-AVOIDLDAPUR: ldaprh w0, [x8] %gep = getelementptr inbounds i16, ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r @@ -125,8 +143,12 @@ define i16 @load_atomic_i16_aligned_acquire_const(ptr readonly %ptr) { ; GISEL: add x8, x0, #8 ; GISEL: ldaprh w0, [x8] ; -; SDAG-LABEL: load_atomic_i16_aligned_acquire_const: -; SDAG: ldapurh w0, [x0, #8] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapurh w0, [x0, #8] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #8 +; SDAG-AVOIDLDAPUR: ldaprh w0, [x8] %gep = getelementptr inbounds i16, 
ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r @@ -183,16 +205,30 @@ define i32 @load_atomic_i32_aligned_monotonic_const(ptr readonly %ptr) { } define i32 @load_atomic_i32_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i32_aligned_acquire: -; CHECK: ldapur w0, [x0, #16] +; GISEL-LABEL: load_atomic_i32_aligned_acquire: +; GISEL: ldapur w0, [x0, #16] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapur w0, [x0, #16] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #16 +; SDAG-AVOIDLDAPUR: ldapr w0, [x8] %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %r = load atomic i32, ptr %gep acquire, align 4 ret i32 %r } define i32 @load_atomic_i32_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i32_aligned_acquire_const: -; CHECK: ldapur w0, [x0, #16] +; GISEL-LABEL: load_atomic_i32_aligned_acquire_const: +; GISEL: ldapur w0, [x0, #16] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapur w0, [x0, #16] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #16 +; SDAG-AVOIDLDAPUR: ldapr w0, [x8] %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %r = load atomic i32, ptr %gep acquire, align 4 ret i32 %r @@ -249,16 +285,30 @@ define i64 @load_atomic_i64_aligned_monotonic_const(ptr readonly %ptr) { } define i64 @load_atomic_i64_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i64_aligned_acquire: -; CHECK: ldapur x0, [x0, #32] +; GISEL-LABEL: load_atomic_i64_aligned_acquire: +; GISEL: ldapur x0, [x0, #32] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapur x0, [x0, #32] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #32 +; SDAG-AVOIDLDAPUR: ldapr x0, [x8] %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %r = load atomic i64, ptr %gep 
acquire, align 8 ret i64 %r } define i64 @load_atomic_i64_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i64_aligned_acquire_const: -; CHECK: ldapur x0, [x0, #32] +; GISEL-LABEL: load_atomic_i64_aligned_acquire_const: +; GISEL: ldapur x0, [x0, #32] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapur x0, [x0, #32] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #32 +; SDAG-AVOIDLDAPUR: ldapr x0, [x8] %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %r = load atomic i64, ptr %gep acquire, align 8 ret i64 %r @@ -387,8 +437,12 @@ define i8 @load_atomic_i8_unaligned_acquire(ptr %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_unaligned_acquire: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -399,8 +453,12 @@ define i8 @load_atomic_i8_unaligned_acquire_const(ptr readonly %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_unaligned_acquire_const: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -846,9 +904,14 @@ define i8 @load_atomic_i8_from_gep() { ; GISEL: add x8, x8, #1 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_from_gep: -; SDAG: bl init -; SDAG: ldapurb w0, [sp, #13] +; 
SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [sp, #13] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: orr x8, x19, #0x1 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %a = alloca [3 x i8] call void @init(ptr %a) %arrayidx = getelementptr [3 x i8], ptr %a, i64 0, i64 1 @@ -862,9 +925,14 @@ define i16 @load_atomic_i16_from_gep() { ; GISEL: add x8, x8, #2 ; GISEL: ldaprh w0, [x8] ; -; SDAG-LABEL: load_atomic_i16_from_gep: -; SDAG: bl init -; SDAG: ldapurh w0, [sp, #10] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapurh w0, [sp, #10] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: orr x8, x19, #0x2 +; SDAG-AVOIDLDAPUR: ldaprh w0, [x8] %a = alloca [3 x i16] call void @init(ptr %a) %arrayidx = getelementptr [3 x i16], ptr %a, i64 0, i64 1 @@ -877,9 +945,14 @@ define i32 @load_atomic_i32_from_gep() { ; GISEL: bl init ; GISEL: ldapur w0, [x8, #4] ; -; SDAG-LABEL: load_atomic_i32_from_gep: -; SDAG: bl init -; SDAG: ldapur w0, [sp, #8] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapur w0, [sp, #8] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: add x8, x19, #4 +; SDAG-AVOIDLDAPUR: ldapr w0, [x8] %a = alloca [3 x i32] call void @init(ptr %a) %arrayidx = getelementptr [3 x i32], ptr %a, i64 0, i64 1 @@ -892,9 +965,14 @@ define i64 @load_atomic_i64_from_gep() { ; GISEL: bl init ; GISEL: ldapur x0, [x8, #8] ; -; SDAG-LABEL: load_atomic_i64_from_gep: -; SDAG: bl init -; SDAG: ldapur x0, [sp, #16] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapur x0, [sp, #16] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; 
SDAG-AVOIDLDAPUR: add x8, x19, #8 +; SDAG-AVOIDLDAPUR: ldapr x0, [x8] %a = alloca [3 x i64] call void @init(ptr %a) %arrayidx = getelementptr [3 x i64], ptr %a, i64 0, i64 1