diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index dcdd9f82cde8e..41641e08293ec 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1679,6 +1679,11 @@ class TargetTransformInfo { false; ///< If op is an fp min/max, whether NaNs may be present. }; + /// \returns True if the target prefers fixed width vectorization if the + /// loop vectorizer's cost-model assigns an equal cost to the fixed and + /// scalable version of the vectorized loop. + bool preferFixedOverScalableIfEqualCost() const; + /// \returns True if the target prefers reductions in loop. bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const; @@ -2149,6 +2154,7 @@ class TargetTransformInfo::Concept { virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; + virtual bool preferFixedOverScalableIfEqualCost() const = 0; virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, @@ -2882,6 +2888,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } + bool preferFixedOverScalableIfEqualCost() const override { + return Impl.preferFixedOverScalableIfEqualCost(); + } bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const override { return Impl.preferInLoopReduction(Opcode, Ty, Flags); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 01624de190d51..75ccf3900829d 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -920,6 +920,8 @@ class TargetTransformInfoImplBase 
{ return VF; } + bool preferFixedOverScalableIfEqualCost() const { return false; } + bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return false; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index c175d1737e54b..1ef0c6a3606fb 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1286,6 +1286,10 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF, return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } +bool TargetTransformInfo::preferFixedOverScalableIfEqualCost() const { + return TTIImpl->preferFixedOverScalableIfEqualCost(); +} + bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const { return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags); diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index e523957afc25a..832e44fe117e2 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -355,6 +355,10 @@ def FeatureTHE : ExtensionWithMArch<"the", "THE", "FEAT_THE", // Armv9.0 Architecture Extensions //===----------------------------------------------------------------------===// +def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-if-equal-cost", + "UseFixedOverScalableIfEqualCost", "true", + "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">; + def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl", "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 87927093a2c4c..71384a23c49af 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -525,6 +525,7 @@ def TuneNeoverseV2 : 
SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2 FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureUseFixedOverScalableIfEqualCost, FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3", diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 3eb9aa963c018..a9189fd53f40b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -371,6 +371,10 @@ class AArch64TTIImpl : public BasicTTIImplBase { return TailFoldingStyle::DataWithoutLaneMask; } + bool preferFixedOverScalableIfEqualCost() const { + return ST->useFixedOverScalableIfEqualCost(); + } + bool preferPredicateOverEpilogue(TailFoldingInfo *TFI); bool supportsScalableVectors() const { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5520baef7152d..00a1664ec1819 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4629,7 +4629,9 @@ bool LoopVectorizationPlanner::isMoreProfitable( // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. - bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable(); + bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() && + A.Width.isScalable() && !B.Width.isScalable(); + auto CmpFn = [PreferScalable](const InstructionCost &LHS, const InstructionCost &RHS) { return PreferScalable ? 
LHS <= RHS : LHS < RHS; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll new file mode 100644 index 0000000000000..41595cc7d8996 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/prefer-fixed-if-equal-to-scalable.ll @@ -0,0 +1,60 @@ +; RUN: opt -S < %s -passes=loop-vectorize -force-target-instruction-cost=1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-linux-gnu" + +@a = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64 +@b = dso_local local_unnamed_addr global [32000 x float] zeroinitializer, align 64 + +define void @NeoverseV2() #0 { +; CHECK-LABEL: define void @NeoverseV2( +; CHECK: store <4 x float> +; +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4 + %add = fadd fast float %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 16000 + %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2 + store float %add, ptr %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 16000 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define void @GenericCPU() #1 { +; CHECK-LABEL: define void @GenericCPU( +; CHECK: store +; +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 
4 + %arrayidx2 = getelementptr inbounds [32000 x float], ptr @b, i64 0, i64 %indvars.iv + %1 = load float, ptr %arrayidx2, align 4 + %add = fadd fast float %1, %0 + %2 = add nuw nsw i64 %indvars.iv, 16000 + %arrayidx5 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %2 + store float %add, ptr %arrayidx5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 16000 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve,+sve2,+v9a" } +attributes #1 = { vscale_range(1,16) "target-cpu"="generic" "target-features"="+sve,+v9a" }