[AArch64] Enable the select optimize pass for AArch64

davemgreen · davemgreen · commit 16a72a0f8748 · 2022-12-03T16:08:58.000Z
This enables the select optimize pass for ARM out-of-order AArch64 cores. It is trying to solve a problem that is difficult for the compiler to fix. The criteria for when a csel is better or worse than a branch depend heavily on whether the branch is well predicted and the amount of ILP in the loop (as well as other criteria like the core in question and the relative performance of the branch predictor). The pass seems to do a decent job though, with the inner loop heuristics being well implemented and doing a better job than I had expected in general, even without PGO information. I've been doing quite a bit of benchmarking. The headline numbers are these for SPEC2017 on a Neoverse N1: 500.perlbench_r -0.12%, 502.gcc_r 0.02%, 505.mcf_r 6.02%, 520.omnetpp_r 0.32%, 523.xalancbmk_r 0.20%, 525.x264_r 0.02%, 531.deepsjeng_r 0.00%, 541.leela_r -0.09%, 548.exchange2_r 0.00%, 557.xz_r -0.20%. Running benchmarks with a combination of the llvm-test-suite plus several versions of SPEC gave between a 0.2% and 0.4% geomean improvement depending on the core/run. The instruction count went down by 0.1% too, which is a good sign, but the results can be a little noisy. Some issues from other benchmarks I had run were improved in rGca78b5601466f8515f5f958ef8e63d787d9d812e. In summary, well predicted branches will see an improvement, badly predicted branches may get worse, and on average performance seems to be a little better overall. This patch enables the pass for AArch64 under -O3 for cores that will benefit from it, i.e. not in-order cores that do not fit into the "Assume infinite resources that allow to fully exploit the available instruction-level parallelism" cost model. It uses a subtarget feature for specifying when the pass will be enabled, which I have enabled under cpu=generic as the performance increases for out-of-order cores seem larger than any decreases for in-order cores, which were minor. Differential Revision: https://reviews.llvm.org/D138990
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -805,6 +805,9 @@ class TargetTransformInfo {
   MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
                                                bool IsZeroCmp) const;
 
+  /// Should the Select Optimization pass be enabled and run.
+  bool enableSelectOptimize() const;
+
   /// Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
@@ -1683,6 +1686,7 @@ class TargetTransformInfo::Concept {
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual MemCmpExpansionOptions
   enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;
+  virtual bool enableSelectOptimize() = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool enableMaskedInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
@@ -2173,6 +2177,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }
+  bool enableSelectOptimize() override {
+    return Impl.enableSelectOptimize();
+  }
   bool enableMaskedInterleavedAccessVectorization() override {
     return Impl.enableMaskedInterleavedAccessVectorization();
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -359,6 +359,8 @@ class TargetTransformInfoImplBase {
     return {};
   }
 
+  bool enableSelectOptimize() const { return true; }
+
   bool enableInterleavedAccessVectorization() const { return false; }
 
   bool enableMaskedInterleavedAccessVectorization() const { return false; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -547,6 +547,10 @@ TargetTransformInfo::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   return TTIImpl->enableMemCmpExpansion(OptSize, IsZeroCmp);
 }
 
+bool TargetTransformInfo::enableSelectOptimize() const {
+  return TTIImpl->enableSelectOptimize();
+}
+
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -243,6 +243,10 @@ bool SelectOptimize::runOnFunction(Function &F) {
     return false;
 
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  if (!TTI->enableSelectOptimize())
+    return false;
+
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   BPI.reset(new BranchProbabilityInfo(F, *LI));
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
@@ -209,6 +209,10 @@ def FeaturePredictableSelectIsExpensive : SubtargetFeature<
     "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
     "Prefer likely predicted branches over selects">;
 
+def FeatureEnableSelectOptimize : SubtargetFeature<
+    "enable-select-opt", "EnableSelectOptimize", "true",
+    "Enable the select optimize pass for select loop heuristics">;
+
 def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
     "HasCustomCheapAsMoveHandling", "true",
     "Use custom handling of cheap instructions">;
@@ -743,51 +747,59 @@ def TuneA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
                                    FeatureFuseAdrpAdd,
                                    FeatureFuseLiterals,
                                    FeaturePostRAScheduler,
+                                   FeatureEnableSelectOptimize,
                                    FeaturePredictableSelectIsExpensive]>;
 
 def TuneA65     : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
                                    "Cortex-A65 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAddress,
                                    FeatureFuseAdrpAdd,
-                                   FeatureFuseLiterals]>;
+                                   FeatureFuseLiterals,
+                                   FeatureEnableSelectOptimize]>;
 
 def TuneA72     : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
                                    "Cortex-A72 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeatureFuseLiterals]>;
+                                   FeatureFuseLiterals,
+                                   FeatureEnableSelectOptimize]>;
 
 def TuneA73     : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
                                    "Cortex-A73 ARM processors", [
                                    FeatureFuseAES,
-                                   FeatureFuseAdrpAdd]>;
+                                   FeatureFuseAdrpAdd,
+                                   FeatureEnableSelectOptimize]>;
 
 def TuneA75     : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
                                    "Cortex-A75 ARM processors", [
                                    FeatureFuseAES,
-                                   FeatureFuseAdrpAdd]>;
+                                   FeatureFuseAdrpAdd,
+                                   FeatureEnableSelectOptimize]>;
 
 def TuneA76     : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
                                    "Cortex-A76 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeatureLSLFast]>;
+                                   FeatureLSLFast,
+                                   FeatureEnableSelectOptimize]>;
 
 def TuneA77     : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
                                    "Cortex-A77 ARM processors", [
                                    FeatureCmpBccFusion,
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeatureLSLFast]>;
+                                   FeatureLSLFast,
+                                   FeatureEnableSelectOptimize]>;
 
 def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
                                "Cortex-A78 ARM processors", [
                                FeatureCmpBccFusion,
                                FeatureFuseAES,
                                FeatureFuseAdrpAdd,
                                FeatureLSLFast,
-                               FeaturePostRAScheduler]>;
+                               FeaturePostRAScheduler,
+                               FeatureEnableSelectOptimize]>;
 
 def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
                                 "CortexA78C",
@@ -796,23 +808,26 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
                                 FeatureFuseAES,
                                 FeatureFuseAdrpAdd,
                                 FeatureLSLFast,
-                                FeaturePostRAScheduler]>;
+                                FeaturePostRAScheduler,
+                                FeatureEnableSelectOptimize]>;
 
 def TuneA710    : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
                                    "Cortex-A710 ARM processors", [
                                    FeatureCmpBccFusion,
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
                                    FeatureLSLFast,
-                                   FeaturePostRAScheduler]>;
+                                   FeaturePostRAScheduler,
+                                   FeatureEnableSelectOptimize]>;
 
 def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
                                  "Cortex-A715 ARM processors", [
                                  FeatureFuseAES,
                                  FeaturePostRAScheduler,
                                  FeatureCmpBccFusion,
                                  FeatureLSLFast,
-                                 FeatureFuseAdrpAdd]>;
+                                 FeatureFuseAdrpAdd,
+                                 FeatureEnableSelectOptimize]>;
 
 def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
                                "CortexR82",
@@ -825,22 +840,25 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
                                   FeatureFuseAES,
                                   FeatureFuseAdrpAdd,
                                   FeatureLSLFast,
-                                  FeaturePostRAScheduler]>;
+                                  FeaturePostRAScheduler,
+                                  FeatureEnableSelectOptimize]>;
 
 def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
                                   "Cortex-X2 ARM processors", [
                                   FeatureCmpBccFusion,
                                   FeatureFuseAES,
                                   FeatureFuseAdrpAdd,
                                   FeatureLSLFast,
-                                  FeaturePostRAScheduler]>;
+                                  FeaturePostRAScheduler,
+                                  FeatureEnableSelectOptimize]>;
 
 def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
                               "Cortex-X3 ARM processors", [
                                FeatureLSLFast,
                                FeatureFuseAdrpAdd,
                                FeatureFuseAES,
-                               FeaturePostRAScheduler]>;
+                               FeaturePostRAScheduler,
+                               FeatureEnableSelectOptimize]>;
 
 def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
                                  "Fujitsu A64FX processors", [
@@ -1024,34 +1042,39 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
                                       FeatureLSLFast,
-                                      FeaturePostRAScheduler]>;
+                                      FeaturePostRAScheduler,
+                                      FeatureEnableSelectOptimize]>;
 
 def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2",
                                       "Neoverse N2 ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
                                       FeatureLSLFast,
-                                      FeaturePostRAScheduler]>;
+                                      FeaturePostRAScheduler,
+                                      FeatureEnableSelectOptimize]>;
 
 def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB",
                                       "Neoverse 512-TVB ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
                                       FeatureLSLFast,
-                                      FeaturePostRAScheduler]>;
+                                      FeaturePostRAScheduler,
+                                      FeatureEnableSelectOptimize]>;
 
 def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
                                       "Neoverse V1 ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
                                       FeatureLSLFast,
-                                      FeaturePostRAScheduler]>;
+                                      FeaturePostRAScheduler,
+                                      FeatureEnableSelectOptimize]>;
 
 def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2",
                                       "Neoverse V2 ARM processors", [
                                       FeatureFuseAES,
                                       FeatureLSLFast,
-                                      FeaturePostRAScheduler]>;
+                                      FeaturePostRAScheduler,
+                                      FeatureEnableSelectOptimize]>;
 
 def TuneSaphira  : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    "Qualcomm Saphira processors", [
@@ -1262,7 +1285,8 @@ def ProcessorFeatures {
 // FeatureFuseAdrpAdd is enabled under Generic to allow linker merging
 // optimizations.
 def : ProcessorModel<"generic", CortexA55Model, ProcessorFeatures.Generic,
-                     [FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler]>;
+                     [FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler,
+                      FeatureEnableSelectOptimize]>;
 def : ProcessorModel<"cortex-a35", CortexA53Model, ProcessorFeatures.A53,
                      [TuneA35]>;
 def : ProcessorModel<"cortex-a34", CortexA53Model, ProcessorFeatures.A53,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -131,6 +131,11 @@ static cl::opt<bool>
                  cl::desc("Enable optimizations on complex GEPs"),
                  cl::init(false));
 
+static cl::opt<bool>
+    EnableSelectOpt("aarch64-select-opt", cl::Hidden,
+                    cl::desc("Enable select to branch optimizations"),
+                    cl::init(true));
+
 static cl::opt<bool>
     BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
                      cl::desc("Relax out of range conditional branches"));
@@ -587,6 +592,9 @@ void AArch64PassConfig::addIRPasses() {
 
   TargetPassConfig::addIRPasses();
 
+  if (getOptLevel() == CodeGenOpt::Aggressive && EnableSelectOpt)
+    addPass(createSelectOptimizePass());
+
   addPass(createAArch64StackTaggingPass(
       /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -388,6 +388,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
                                        int64_t BaseOffset, bool HasBaseReg,
                                        int64_t Scale, unsigned AddrSpace) const;
   /// @}
+
+  bool enableSelectOptimize() { return ST->enableSelectOptimize(); }
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -66,6 +66,10 @@
 ; CHECK-NEXT:       Expand reduction intrinsics
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       TLS Variable Hoist
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Optimize selects
 ; CHECK-NEXT:     Stack Safety Analysis
 ; CHECK-NEXT:       FunctionPass Manager
 ; CHECK-NEXT:         Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AArch64/selectopt.ll b/llvm/test/CodeGen/AArch64/selectopt.ll

Original file line number	Diff line number	Diff line change
`@@ -359,6 +359,8 @@ class TargetTransformInfoImplBase {`
`359`	`359`	`return {};`
`360`	`360`	`}`
`361`	`361`
	`362`	`+ bool enableSelectOptimize() const { return true; }`
	`363`	`+`
`362`	`364`	`bool enableInterleavedAccessVectorization() const { return false; }`
`363`	`365`
`364`	`366`	`bool enableMaskedInterleavedAccessVectorization() const { return false; }`
Original file line number	Diff line number	Diff line change
`@@ -547,6 +547,10 @@ TargetTransformInfo::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {`
`547`	`547`	`return TTIImpl->enableMemCmpExpansion(OptSize, IsZeroCmp);`
`548`	`548`	`}`
`549`	`549`
	`550`	`+bool TargetTransformInfo::enableSelectOptimize() const {`
	`551`	`+ return TTIImpl->enableSelectOptimize();`
	`552`	`+}`
	`553`	`+`
`550`	`554`	`bool TargetTransformInfo::enableInterleavedAccessVectorization() const {`
`551`	`555`	`return TTIImpl->enableInterleavedAccessVectorization();`
`552`	`556`	`}`