Merge branch 'main' into dynamicextent_keep_live

balazske · web-flow · commit 0041f31fb1d1 · 2025-11-17T10:06:39.000+01:00
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -77,6 +77,10 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
     "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
     cl::desc("The number of instructions to search for a redundant dmb"));
 
+static cl::opt<int> Aarch64ForceUnrollThreshold(
+    "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
+    cl::desc("Threshold for forced unrolling of small loops in AArch64"));
+
 namespace {
 class TailFoldingOption {
   // These bitfields will only ever be set to something non-zero in operator=,
@@ -4163,12 +4167,15 @@ InstructionCost AArch64TTIImpl::getScalarizationOverhead(
 
 std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
     Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
-    TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
+    TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
     std::function<InstructionCost(Type *)> InstCost) const {
   if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
     return std::nullopt;
   if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
     return std::nullopt;
+  if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
+      ST->isNonStreamingSVEorSME2Available())
+    return std::nullopt;
 
   Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
   InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
@@ -4210,6 +4217,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
       ISD == ISD::FDIV || ISD == ISD::FREM)
     if (auto PromotedCost = getFP16BF16PromoteCost(
             Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
+            // There is not native support for fdiv/frem even with +sve-b16b16.
+            /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
             [&](Type *PromotedTy) {
               return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
                                             Op1Info, Op2Info);
@@ -4624,7 +4633,8 @@ InstructionCost AArch64TTIImpl::getCmpSelInstrCost(
   if (Opcode == Instruction::FCmp) {
     if (auto PromotedCost = getFP16BF16PromoteCost(
             ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
-            [&](Type *PromotedTy) {
+            // TODO: Consider costing SVE FCMPs.
+            /*CanUseSVE=*/false, [&](Type *PromotedTy) {
               InstructionCost Cost =
                   getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
                                      CostKind, Op1Info, Op2Info);
@@ -5250,6 +5260,7 @@ void AArch64TTIImpl::getUnrollingPreferences(
   // inlining. Don't unroll auto-vectorized loops either, though do allow
   // unrolling of the scalar remainder.
   bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
+  InstructionCost Cost = 0;
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
       // Both auto-vectorized loops and the scalar remainder have the
@@ -5264,6 +5275,10 @@ void AArch64TTIImpl::getUnrollingPreferences(
               continue;
         return;
       }
+
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      Cost += getInstructionCost(&I, Operands,
+                                 TargetTransformInfo::TCK_SizeAndLatency);
     }
   }
 
@@ -5310,6 +5325,11 @@ void AArch64TTIImpl::getUnrollingPreferences(
     UP.UnrollAndJam = true;
     UP.UnrollAndJamInnerLoopThreshold = 60;
   }
+
+  // Force unrolling small loops can be very useful because of the branch
+  // taken cost of the backedge.
+  if (Cost < Aarch64ForceUnrollThreshold)
+    UP.Force = true;
 }
 
 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -456,11 +456,10 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
 
   /// FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the
   /// architecture features are not present.
-  std::optional<InstructionCost>
-  getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind,
-                         TTI::OperandValueInfo Op1Info,
-                         TTI::OperandValueInfo Op2Info, bool IncludeTrunc,
-                         std::function<InstructionCost(Type *)> InstCost) const;
+  std::optional<InstructionCost> getFP16BF16PromoteCost(
+      Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
+      TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
+      std::function<InstructionCost(Type *)> InstCost) const;
 
   InstructionCost
   getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
@@ -33,11 +33,17 @@ define void @fadd() {
 }
 
 define void @fadd_bf16() {
-; CHECK-LABEL: 'fadd_bf16'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
-; CHECK-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
-; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-BASE-LABEL: 'fadd_bf16'
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-BF16-LABEL: 'fadd_bf16'
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fadd <vscale x 16 x bfloat> poison, poison
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %NXV4BF16 = fadd <vscale x 4 x bfloat> poison, poison
   %NXV8BF16 = fadd <vscale x 8 x bfloat> poison, poison
@@ -76,11 +82,17 @@ define void @fsub() {
 }
 
 define void @fsub_bf16() {
-; CHECK-LABEL: 'fsub_bf16'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
-; CHECK-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
-; CHECK-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-BASE-LABEL: 'fsub_bf16'
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:27 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-BF16-LABEL: 'fsub_bf16'
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fsub <vscale x 16 x bfloat> poison, poison
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %NXV4BF16 = fsub <vscale x 4 x bfloat> poison, poison
   %NXV8BF16 = fsub <vscale x 8 x bfloat> poison, poison
@@ -160,11 +172,17 @@ define void @fmul() {
 }
 
 define void @fmul_bf16() {
-; CHECK-LABEL: 'fmul_bf16'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
-; CHECK-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
-; CHECK-NEXT:  Cost Model: Found costs of RThru:58 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
-; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; CHECK-BASE-LABEL: 'fmul_bf16'
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:58 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
+; CHECK-BASE-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+;
+; CHECK-BF16-LABEL: 'fmul_bf16'
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %NXV16BF16 = fmul <vscale x 16 x bfloat> poison, poison
+; CHECK-BF16-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %NXV4BF16 = fmul <vscale x 4 x bfloat> poison, poison
   %NXV8BF16 = fmul <vscale x 8 x bfloat> poison, poison
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll b/llvm/test/Transforms/LoopUnroll/AArch64/force-unroll-threshold.ll
@@ -0,0 +1,90 @@
+; RUN: opt -passes=loop-unroll -S -unroll-runtime %s | FileCheck %s --check-prefix=NOFORCE
+; RUN: opt -passes=loop-unroll -S -unroll-runtime -aarch64-force-unroll-threshold=500 %s | FileCheck %s --check-prefix=FORCE
+
+; The loop has a small runtime upper bound (at most four iterations) but a
+; relatively expensive body. With runtime unrolling enabled, the cost model
+; still leaves the loop rolled. Raising the AArch64 force threshold overrides
+; that decision and unrolls.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @force_small_loop(ptr nocapture %a, ptr nocapture %b, i32 %n) {
+entry:
+  br label %loop
+
+; NOFORCE-LABEL: @force_small_loop(
+; NOFORCE:       loop:
+; NOFORCE:         br i1 %cond, label %body, label %exit
+; NOFORCE:       body:
+; NOFORCE:         store i32 %mix15, ptr %ptrb, align 4
+; NOFORCE:       latch:
+; NOFORCE:         br i1 %cmp2, label %loop, label %exit
+; NOFORCE:       ret void
+; NOFORCE-NOT:   loop.1:
+;
+; FORCE-LABEL: @force_small_loop(
+; FORCE:       loop:
+; FORCE:         br i1 %cond, label %body, label %exit
+; FORCE:       loop.1:
+; FORCE:         br i1 true, label %body.1, label %exit
+; FORCE:       body.1:
+; FORCE:         store i32 %mix15.1, ptr %ptrb.1, align 4
+; FORCE:       latch.1:
+; FORCE:         br i1 %cmp2.1, label %loop, label %exit
+; FORCE:       ret void
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  %ptra = getelementptr inbounds i32, ptr %a, i32 %i
+  %pa = load i32, ptr %ptra, align 4
+  %tmp0 = mul nsw i32 %pa, %pa
+  %tmp1 = add nsw i32 %tmp0, %pa
+  %tmp2 = shl i32 %tmp1, 1
+  %tmp3 = ashr i32 %tmp2, 1
+  %tmp4 = xor i32 %tmp3, %pa
+  %tmp5 = add nsw i32 %tmp4, 7
+  %tmp6 = mul nsw i32 %tmp5, 5
+  %tmp7 = add nsw i32 %tmp6, %tmp4
+  %tmp8 = mul nsw i32 %tmp7, %tmp3
+  %tmp9 = add nsw i32 %tmp8, %tmp7
+  %tmp10 = xor i32 %tmp9, %tmp6
+  %tmp11 = add nsw i32 %tmp10, %tmp8
+  %tmp12 = mul nsw i32 %tmp11, 9
+  %tmp13 = add nsw i32 %tmp12, %tmp10
+  %tmp14 = xor i32 %tmp13, %tmp11
+  %cond = icmp ult i32 %i, %n
+  br i1 %cond, label %body, label %exit
+
+body:
+  %ptrb = getelementptr inbounds i32, ptr %b, i32 %i
+  %pb = load i32, ptr %ptrb, align 4
+  %sum = add nsw i32 %pb, %tmp14
+  %diff = sub nsw i32 %sum, %pa
+  %mix1 = mul nsw i32 %diff, 3
+  %mix2 = add nsw i32 %mix1, %tmp3
+  %mix3 = xor i32 %mix2, %diff
+  %mix4 = add nsw i32 %mix3, %tmp0
+  %mix5 = mul nsw i32 %mix4, 11
+  %mix6 = add nsw i32 %mix5, %mix2
+  %mix7 = xor i32 %mix6, %mix5
+  %mix8 = add nsw i32 %mix7, %mix3
+  %mix9 = mul nsw i32 %mix8, 13
+  %mix10 = add nsw i32 %mix9, %mix8
+  %mix11 = xor i32 %mix10, %mix7
+  %mix12 = add nsw i32 %mix11, %mix6
+  %mix13 = mul nsw i32 %mix12, 17
+  %mix14 = add nsw i32 %mix13, %mix9
+  %mix15 = xor i32 %mix14, %mix10
+  store i32 %mix15, ptr %ptrb, align 4
+  br label %latch
+
+latch:
+  %inc = add nuw nsw i32 %i, 1
+  %cmp.limit = icmp ult i32 %n, 4
+  %upper = select i1 %cmp.limit, i32 %n, i32 4
+  %cmp2 = icmp ult i32 %inc, %upper
+  br i1 %cmp2, label %loop, label %exit
+
+exit:
+  ret void
+}