Skip to content

Commit 16a72a0

Browse files
committed
[AArch64] Enable the select optimize pass for AArch64
This enables the select optimize pass for Arm out-of-order AArch64 cores. It is trying to solve a problem that is difficult for the compiler to fix. The criteria for when a csel is better or worse than a branch depend heavily on whether the branch is well predicted and the amount of ILP in the loop (as well as other criteria like the core in question and the relative performance of the branch predictor). The pass seems to do a decent job though, with the inner loop heuristics being well implemented and doing a better job than I had expected in general, even without PGO information. I've been doing quite a bit of benchmarking. The headline numbers are these for SPEC2017 on a Neoverse N1: 500.perlbench_r -0.12%; 502.gcc_r 0.02%; 505.mcf_r 6.02%; 520.omnetpp_r 0.32%; 523.xalancbmk_r 0.20%; 525.x264_r 0.02%; 531.deepsjeng_r 0.00%; 541.leela_r -0.09%; 548.exchange2_r 0.00%; 557.xz_r -0.20%. Running benchmarks with a combination of the llvm-test-suite plus several versions of SPEC gave between a 0.2% and 0.4% geomean improvement depending on the core/run. The instruction count went down by 0.1% too, which is a good sign, but the results can be a little noisy. Some issues from other benchmarks I had run were improved in rGca78b5601466f8515f5f958ef8e63d787d9d812e. In summary, well predicted branches will see an improvement, badly predicted branches may get worse, and on average performance seems to be a little better overall. This patch enables the pass for AArch64 under -O3 for cores that will benefit from it, i.e. not in-order cores that do not fit into the "Assume infinite resources that allow to fully exploit the available instruction-level parallelism" cost model. It uses a subtarget feature for specifying when the pass will be enabled, which I have enabled under cpu=generic as the performance increases for out-of-order cores seem larger than any decreases for in-order cores, which were minor. Differential Revision: https://reviews.llvm.org/D138990
1 parent 2f999cc commit 16a72a0

File tree

9 files changed

+337
-19
lines changed

9 files changed

+337
-19
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -805,6 +805,9 @@ class TargetTransformInfo {
805805
MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
806806
bool IsZeroCmp) const;
807807

808+
/// Should the Select Optimization pass be enabled and run.
809+
bool enableSelectOptimize() const;
810+
808811
/// Enable matching of interleaved access groups.
809812
bool enableInterleavedAccessVectorization() const;
810813

@@ -1683,6 +1686,7 @@ class TargetTransformInfo::Concept {
16831686
virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
16841687
virtual MemCmpExpansionOptions
16851688
enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;
1689+
virtual bool enableSelectOptimize() = 0;
16861690
virtual bool enableInterleavedAccessVectorization() = 0;
16871691
virtual bool enableMaskedInterleavedAccessVectorization() = 0;
16881692
virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
@@ -2173,6 +2177,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
21732177
bool enableInterleavedAccessVectorization() override {
21742178
return Impl.enableInterleavedAccessVectorization();
21752179
}
2180+
bool enableSelectOptimize() override {
2181+
return Impl.enableSelectOptimize();
2182+
}
21762183
bool enableMaskedInterleavedAccessVectorization() override {
21772184
return Impl.enableMaskedInterleavedAccessVectorization();
21782185
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,8 @@ class TargetTransformInfoImplBase {
359359
return {};
360360
}
361361

362+
bool enableSelectOptimize() const { return true; }
363+
362364
bool enableInterleavedAccessVectorization() const { return false; }
363365

364366
bool enableMaskedInterleavedAccessVectorization() const { return false; }

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,10 @@ TargetTransformInfo::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
547547
return TTIImpl->enableMemCmpExpansion(OptSize, IsZeroCmp);
548548
}
549549

550+
bool TargetTransformInfo::enableSelectOptimize() const {
551+
return TTIImpl->enableSelectOptimize();
552+
}
553+
550554
bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
551555
return TTIImpl->enableInterleavedAccessVectorization();
552556
}

llvm/lib/CodeGen/SelectOptimize.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,10 @@ bool SelectOptimize::runOnFunction(Function &F) {
243243
return false;
244244

245245
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
246+
247+
if (!TTI->enableSelectOptimize())
248+
return false;
249+
246250
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
247251
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
248252
BPI.reset(new BranchProbabilityInfo(F, *LI));

llvm/lib/Target/AArch64/AArch64.td

Lines changed: 43 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,10 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature<
209209
"predictable-select-expensive", "PredictableSelectIsExpensive", "true",
210210
"Prefer likely predicted branches over selects">;
211211

212+
def FeatureEnableSelectOptimize : SubtargetFeature<
213+
"enable-select-opt", "EnableSelectOptimize", "true",
214+
"Enable the select optimize pass for select loop heuristics">;
215+
212216
def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
213217
"HasCustomCheapAsMoveHandling", "true",
214218
"Use custom handling of cheap instructions">;
@@ -743,51 +747,59 @@ def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
743747
FeatureFuseAdrpAdd,
744748
FeatureFuseLiterals,
745749
FeaturePostRAScheduler,
750+
FeatureEnableSelectOptimize,
746751
FeaturePredictableSelectIsExpensive]>;
747752

748753
def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
749754
"Cortex-A65 ARM processors", [
750755
FeatureFuseAES,
751756
FeatureFuseAddress,
752757
FeatureFuseAdrpAdd,
753-
FeatureFuseLiterals]>;
758+
FeatureFuseLiterals,
759+
FeatureEnableSelectOptimize]>;
754760

755761
def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
756762
"Cortex-A72 ARM processors", [
757763
FeatureFuseAES,
758764
FeatureFuseAdrpAdd,
759-
FeatureFuseLiterals]>;
765+
FeatureFuseLiterals,
766+
FeatureEnableSelectOptimize]>;
760767

761768
def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
762769
"Cortex-A73 ARM processors", [
763770
FeatureFuseAES,
764-
FeatureFuseAdrpAdd]>;
771+
FeatureFuseAdrpAdd,
772+
FeatureEnableSelectOptimize]>;
765773

766774
def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
767775
"Cortex-A75 ARM processors", [
768776
FeatureFuseAES,
769-
FeatureFuseAdrpAdd]>;
777+
FeatureFuseAdrpAdd,
778+
FeatureEnableSelectOptimize]>;
770779

771780
def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
772781
"Cortex-A76 ARM processors", [
773782
FeatureFuseAES,
774783
FeatureFuseAdrpAdd,
775-
FeatureLSLFast]>;
784+
FeatureLSLFast,
785+
FeatureEnableSelectOptimize]>;
776786

777787
def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
778788
"Cortex-A77 ARM processors", [
779789
FeatureCmpBccFusion,
780790
FeatureFuseAES,
781791
FeatureFuseAdrpAdd,
782-
FeatureLSLFast]>;
792+
FeatureLSLFast,
793+
FeatureEnableSelectOptimize]>;
783794

784795
def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
785796
"Cortex-A78 ARM processors", [
786797
FeatureCmpBccFusion,
787798
FeatureFuseAES,
788799
FeatureFuseAdrpAdd,
789800
FeatureLSLFast,
790-
FeaturePostRAScheduler]>;
801+
FeaturePostRAScheduler,
802+
FeatureEnableSelectOptimize]>;
791803

792804
def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
793805
"CortexA78C",
@@ -796,23 +808,26 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
796808
FeatureFuseAES,
797809
FeatureFuseAdrpAdd,
798810
FeatureLSLFast,
799-
FeaturePostRAScheduler]>;
811+
FeaturePostRAScheduler,
812+
FeatureEnableSelectOptimize]>;
800813

801814
def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
802815
"Cortex-A710 ARM processors", [
803816
FeatureCmpBccFusion,
804817
FeatureFuseAES,
805818
FeatureFuseAdrpAdd,
806819
FeatureLSLFast,
807-
FeaturePostRAScheduler]>;
820+
FeaturePostRAScheduler,
821+
FeatureEnableSelectOptimize]>;
808822

809823
def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
810824
"Cortex-A715 ARM processors", [
811825
FeatureFuseAES,
812826
FeaturePostRAScheduler,
813827
FeatureCmpBccFusion,
814828
FeatureLSLFast,
815-
FeatureFuseAdrpAdd]>;
829+
FeatureFuseAdrpAdd,
830+
FeatureEnableSelectOptimize]>;
816831

817832
def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
818833
"CortexR82",
@@ -825,22 +840,25 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
825840
FeatureFuseAES,
826841
FeatureFuseAdrpAdd,
827842
FeatureLSLFast,
828-
FeaturePostRAScheduler]>;
843+
FeaturePostRAScheduler,
844+
FeatureEnableSelectOptimize]>;
829845

830846
def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
831847
"Cortex-X2 ARM processors", [
832848
FeatureCmpBccFusion,
833849
FeatureFuseAES,
834850
FeatureFuseAdrpAdd,
835851
FeatureLSLFast,
836-
FeaturePostRAScheduler]>;
852+
FeaturePostRAScheduler,
853+
FeatureEnableSelectOptimize]>;
837854

838855
def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
839856
"Cortex-X3 ARM processors", [
840857
FeatureLSLFast,
841858
FeatureFuseAdrpAdd,
842859
FeatureFuseAES,
843-
FeaturePostRAScheduler]>;
860+
FeaturePostRAScheduler,
861+
FeatureEnableSelectOptimize]>;
844862

845863
def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
846864
"Fujitsu A64FX processors", [
@@ -1024,34 +1042,39 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1
10241042
FeatureFuseAES,
10251043
FeatureFuseAdrpAdd,
10261044
FeatureLSLFast,
1027-
FeaturePostRAScheduler]>;
1045+
FeaturePostRAScheduler,
1046+
FeatureEnableSelectOptimize]>;
10281047

10291048
def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2",
10301049
"Neoverse N2 ARM processors", [
10311050
FeatureFuseAES,
10321051
FeatureFuseAdrpAdd,
10331052
FeatureLSLFast,
1034-
FeaturePostRAScheduler]>;
1053+
FeaturePostRAScheduler,
1054+
FeatureEnableSelectOptimize]>;
10351055

10361056
def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB",
10371057
"Neoverse 512-TVB ARM processors", [
10381058
FeatureFuseAES,
10391059
FeatureFuseAdrpAdd,
10401060
FeatureLSLFast,
1041-
FeaturePostRAScheduler]>;
1061+
FeaturePostRAScheduler,
1062+
FeatureEnableSelectOptimize]>;
10421063

10431064
def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
10441065
"Neoverse V1 ARM processors", [
10451066
FeatureFuseAES,
10461067
FeatureFuseAdrpAdd,
10471068
FeatureLSLFast,
1048-
FeaturePostRAScheduler]>;
1069+
FeaturePostRAScheduler,
1070+
FeatureEnableSelectOptimize]>;
10491071

10501072
def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2",
10511073
"Neoverse V2 ARM processors", [
10521074
FeatureFuseAES,
10531075
FeatureLSLFast,
1054-
FeaturePostRAScheduler]>;
1076+
FeaturePostRAScheduler,
1077+
FeatureEnableSelectOptimize]>;
10551078

10561079
def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
10571080
"Qualcomm Saphira processors", [
@@ -1262,7 +1285,8 @@ def ProcessorFeatures {
12621285
// FeatureFuseAdrpAdd is enabled under Generic to allow linker merging
12631286
// optimizations.
12641287
def : ProcessorModel<"generic", CortexA55Model, ProcessorFeatures.Generic,
1265-
[FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler]>;
1288+
[FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler,
1289+
FeatureEnableSelectOptimize]>;
12661290
def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53,
12671291
[TuneA35]>;
12681292
def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53,

llvm/lib/Target/AArch64/AArch64TargetMachine.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ static cl::opt<bool>
131131
cl::desc("Enable optimizations on complex GEPs"),
132132
cl::init(false));
133133

134+
static cl::opt<bool>
135+
EnableSelectOpt("aarch64-select-opt", cl::Hidden,
136+
cl::desc("Enable select to branch optimizations"),
137+
cl::init(true));
138+
134139
static cl::opt<bool>
135140
BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
136141
cl::desc("Relax out of range conditional branches"));
@@ -587,6 +592,9 @@ void AArch64PassConfig::addIRPasses() {
587592

588593
TargetPassConfig::addIRPasses();
589594

595+
if (getOptLevel() == CodeGenOpt::Aggressive && EnableSelectOpt)
596+
addPass(createSelectOptimizePass());
597+
590598
addPass(createAArch64StackTaggingPass(
591599
/*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
592600

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
388388
int64_t BaseOffset, bool HasBaseReg,
389389
int64_t Scale, unsigned AddrSpace) const;
390390
/// @}
391+
392+
bool enableSelectOptimize() { return ST->enableSelectOptimize(); }
391393
};
392394

393395
} // end namespace llvm

llvm/test/CodeGen/AArch64/O3-pipeline.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,10 @@
6666
; CHECK-NEXT: Expand reduction intrinsics
6767
; CHECK-NEXT: Natural Loop Information
6868
; CHECK-NEXT: TLS Variable Hoist
69+
; CHECK-NEXT: Lazy Branch Probability Analysis
70+
; CHECK-NEXT: Lazy Block Frequency Analysis
71+
; CHECK-NEXT: Optimization Remark Emitter
72+
; CHECK-NEXT: Optimize selects
6973
; CHECK-NEXT: Stack Safety Analysis
7074
; CHECK-NEXT: FunctionPass Manager
7175
; CHECK-NEXT: Dominator Tree Construction

0 commit comments

Comments
 (0)