[AArch64] Improve codegen of vectorised early exit loops

david-arm · david-arm · commit c01a65ffb365 · 2024-12-11T09:58:07.000Z
Once PR llvm#112138 lands we are able to start vectorising more loops that have uncountable early exits. The typical loop structure looks like this: vector.body: ... %pred = icmp eq <2 x ptr> %wide.load, %broadcast.splat ... %or.reduc = tail call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %pred) %iv.cmp = icmp eq i64 %index.next, 4 %exit.cond = or i1 %or.reduc, %iv.cmp br i1 %exit.cond, label %middle.split, label %vector.body middle.split: br i1 %or.reduc, label %found, label %notfound found: ret i64 1 notfound: ret i64 0 The problem with this is that %or.reduc is kept live after the loop, and since this is a boolean it typically requires making a copy of the condition code register. For AArch64 this requires an additional cset instruction, which is quite expensive for a typical find loop that only contains 6 or 7 instructions. This patch attempts to improve the codegen by sinking the reduction out of the loop to the location of its user. It's a lot cheaper to keep the predicate alive if the type is legal and has lots of registers for it. There is a potential downside in that a little more work is required after the loop, but I believe this is worth it since we are likely to spend most of our time in the loop.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5092,18 +5092,41 @@ bool AArch64TTIImpl::isProfitableToSinkOperands(
     }
   }
 
-  // Sink vscales closer to uses for better isel
+  auto ShouldSinkCondition = [] (Value *Cond) -> bool {
+    auto *II = dyn_cast<IntrinsicInst>(Cond);
+    return II && II->getIntrinsicID() == Intrinsic::vector_reduce_or &&
+           isa<ScalableVectorType>(II->getOperand(0)->getType());
+  };
+
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
   case Instruction::Add:
   case Instruction::Sub:
+    // Sink vscales closer to uses for better isel
     for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
       if (shouldSinkVScale(I->getOperand(Op), Ops)) {
         Ops.push_back(&I->getOperandUse(Op));
         return true;
       }
     }
     break;
+  case Instruction::Select: {
+    if (!ShouldSinkCondition(I->getOperand(0)))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
+  case Instruction::Br: {
+    if (cast<BranchInst>(I)->isUnconditional())
+      return false;
+
+    if (!ShouldSinkCondition(cast<BranchInst>(I)->getCondition()))
+      return false;
+
+    Ops.push_back(&I->getOperandUse(0));
+    return true;
+  }
   default:
     break;
   }
diff --git a/llvm/test/CodeGen/AArch64/reduce-or-opt.ll b/llvm/test/CodeGen/AArch64/reduce-or-opt.ll
@@ -95,24 +95,24 @@ notfound:
 define i64 @select_or_reduce_nxv2i1(ptr nocapture noundef readonly %src) {
 ; CHECK-LABEL: select_or_reduce_nxv2i1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cntd x9
+; CHECK-NEXT:    cntd x8
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    mov x10, xzr
-; CHECK-NEXT:    neg x8, x9
-; CHECK-NEXT:    add x11, x8, #4
+; CHECK-NEXT:    mov x9, xzr
+; CHECK-NEXT:    neg x10, x8
+; CHECK-NEXT:    add x10, x10, #4
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x10, lsl #3]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; CHECK-NEXT:    cset w8, ne
 ; CHECK-NEXT:    b.ne .LBB2_3
 ; CHECK-NEXT:  // %bb.2: // %vector.body
 ; CHECK-NEXT:    // in Loop: Header=BB2_1 Depth=1
-; CHECK-NEXT:    cmp x11, x10
-; CHECK-NEXT:    add x10, x10, x9
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    add x9, x9, x8
 ; CHECK-NEXT:    b.ne .LBB2_1
 ; CHECK-NEXT:  .LBB2_3: // %middle.split
-; CHECK-NEXT:    mov x0, x8
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
 entry:
   %vscale = tail call i64 @llvm.vscale.i64()
@@ -147,15 +147,15 @@ define i64 @br_or_reduce_nxv2i1(ptr nocapture noundef readonly %src, ptr noundef
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x9, lsl #3]
 ; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
-; CHECK-NEXT:    cset w11, ne
 ; CHECK-NEXT:    b.ne .LBB3_3
 ; CHECK-NEXT:  // %bb.2: // %vector.body
 ; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
 ; CHECK-NEXT:    cmp x10, x9
 ; CHECK-NEXT:    add x9, x9, x8
 ; CHECK-NEXT:    b.ne .LBB3_1
 ; CHECK-NEXT:  .LBB3_3: // %middle.split
-; CHECK-NEXT:    tbz w11, #0, .LBB3_5
+; CHECK-NEXT:    ptest p0, p1.b
+; CHECK-NEXT:    b.eq .LBB3_5
 ; CHECK-NEXT:  // %bb.4: // %found
 ; CHECK-NEXT:    mov w8, #56 // =0x38
 ; CHECK-NEXT:    mov w0, #1 // =0x1