From 8397d3f2565afbbb3d36045433a3fa673745755b Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Tue, 15 Jul 2025 19:01:16 +0800
Subject: [PATCH] [VPlan] Fold safe divisors into VP intrinsics with EVL

If a udiv/sdiv/urem/srem needs to be predicated, either because of control
flow in the loop or because of tail folding, we replace the divisor in the
masked-off lanes with a safe value, i.e. 1, via a select.

For EVL tail folding we can optimize away the select and use a VP intrinsic
directly, which helps cases in SPEC 525.x264_r and the llvm-test-suite on
RISC-V, e.g.:

```diff
-        vmv.v.i v9, 1
         lui     a2, 4
-        vmv.v.x v10, a2
+        vmv.v.x v9, a2
 .Lpcrel_hi387:
         auipc   a2, %pcrel_hi(_ZL2dt)
         addi    a2, a2, %pcrel_lo(.Lpcrel_hi387)
 .LBB75_4:                               # %vector.body
                                         # =>This Inner Loop Header: Depth=1
         sub     a3, a1, a0
-        vmv1r.v v11, v9
         vsetvli a3, a3, e16, m1, ta, ma
-        vadd.vv v12, v8, v8
-        vsetvli zero, zero, e16, m1, tu, ma
-        vadd.vi v11, v12, 3
-        vsetvli zero, zero, e16, m1, ta, ma
-        vdivu.vv        v11, v10, v11
+        vadd.vv v10, v8, v8
+        vadd.vi v10, v10, 3
+        vdivu.vv        v12, v9, v10
         sh2add  a4, a0, a2
         add     a0, a0, a3
         vsetvli zero, zero, e32, m2, ta, ma
-        vzext.vf2       v12, v11
-        vse32.v v12, (a4)
+        vzext.vf2       v10, v12
+        vse32.v v10, (a4)
```

It's tempting to try to fold away any arbitrary mask/EVL combination feeding
into a divisor operand, but care needs to be taken: this transform replaces
lanes that were previously well defined with poison. So we can only do this
with the EVL-based IV and an all-ones mask, where we know that the lanes past
EVL aren't used. This is also why I chose to do this as a VPlan transform as
opposed to e.g. RISCVCodeGenPrepare: it's much harder to recover the EVL-based
IV outside of VPlan.

Also worth noting is that we still avoid transforming non-trapping recipes to
VP intrinsics; we only need to handle these cases because of their trapping
behaviour.

I looked into whether we could make the "safe-divisor" case more recognisable,
either by adding a new recipe or a new VPInstruction to represent a divisor
that's safe to fold into a div and produce poison, but that ended up making
things more complicated.

Fixes #129538
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 48 +++++++++++++++++++
 .../vectorize-force-tail-with-evl-div.ll      | 12 ++---
 2 files changed, 52 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 6a3b3e6e41955..664e42b7e3318 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2176,6 +2176,52 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
       .Default([&](VPRecipeBase *R) { return nullptr; });
 }
 
+/// Try to optimize safe divisors away by converting their users to VP
+/// intrinsics:
+///
+/// udiv x, (vp.merge allones, y, 1, evl) -> vp.udiv x, y, allones, evl
+///
+/// Note the lanes past EVL will be changed from x to poison. This only works
+/// for the EVL-based IV and not any arbitrary EVL, because we know nothing
+/// will read the lanes past the EVL-based IV.
+static void
+optimizeSafeDivisorsToEVL(VPTypeAnalysis &TypeInfo, VPValue &AllOneMask,
+                          VPValue &EVL,
+                          SmallVectorImpl<VPRecipeBase *> &ToErase) {
+  using namespace VPlanPatternMatch;
+  for (VPUser *U : to_vector(EVL.users())) {
+    VPValue *Y;
+    if (!match(U, m_Intrinsic<Intrinsic::vp_merge>(m_AllOnes(), m_VPValue(Y),
+                                                   m_SpecificInt(1),
+                                                   m_Specific(&EVL))))
+      continue;
+    auto *Merge = cast<VPSingleDefRecipe>(U);
+
+    for (VPUser *User : to_vector(Merge->users())) {
+      auto *WidenR = dyn_cast<VPWidenRecipe>(User);
+      if (!WidenR || WidenR->getOperand(1) != Merge)
+        continue;
+      switch (WidenR->getOpcode()) {
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::URem:
+      case Instruction::SRem:
+        break;
+      default:
+        continue;
+      }
+      VPValue *X = WidenR->getOperand(0);
+
+      auto *VPUDiv = new VPWidenIntrinsicRecipe(
+          VPIntrinsic::getForOpcode(WidenR->getOpcode()),
+          {X, Y, &AllOneMask, &EVL}, TypeInfo.inferScalarType(Merge));
+      VPUDiv->insertBefore(WidenR);
+      WidenR->replaceAllUsesWith(VPUDiv);
+      ToErase.push_back(WidenR);
+    }
+  }
+}
+
 /// Replace recipes with their EVL variants.
 static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
   Type *CanonicalIVType = Plan.getCanonicalIV()->getScalarType();
@@ -2259,6 +2305,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
     }
   }
 
+  optimizeSafeDivisorsToEVL(TypeInfo, *AllOneMask, EVL, ToErase);
+
   for (VPRecipeBase *R : reverse(ToErase)) {
     SmallVector<VPValue *> PossiblyDead(R->operands());
     R->eraseFromParent();
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
index 3e83d8a757b5d..9936b7ef0de54 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-div.ll
@@ -35,8 +35,7 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP:%.*]] = sdiv <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call <vscale x 2 x i64> @llvm.vp.sdiv.nxv2i64(<vscale x 2 x i64> [[VP_OP_LOAD]], <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
@@ -131,8 +130,7 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP:%.*]] = udiv <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call <vscale x 2 x i64> @llvm.vp.udiv.nxv2i64(<vscale x 2 x i64> [[VP_OP_LOAD]], <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
@@ -226,8 +224,7 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP:%.*]] = srem <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call <vscale x 2 x i64> @llvm.vp.srem.nxv2i64(<vscale x 2 x i64> [[VP_OP_LOAD]], <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
@@ -321,8 +318,7 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) {
 ; IF-EVL-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP9]], i32 0
 ; IF-EVL-NEXT:    [[VP_OP_LOAD1:%.*]] = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP10]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i64> splat (i64 1), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP:%.*]] = urem <vscale x 2 x i64> [[VP_OP_LOAD]], [[TMP11]]
+; IF-EVL-NEXT:    [[VP_OP:%.*]] = call <vscale x 2 x i64> @llvm.vp.urem.nxv2i64(<vscale x 2 x i64> [[VP_OP_LOAD]], <vscale x 2 x i64> [[VP_OP_LOAD1]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
 ; IF-EVL-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_OP]], ptr align 8 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP5]])
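
Note: the following is an illustrative sketch, not part of the patch above. It shows
the IR-level effect of the fold described in the commit message on a standalone
function, using hypothetical value and function names, and assumes a recent LLVM
that accepts the `splat` constant syntax for scalable vectors:

```llvm
; Before: the divisor is made safe with a vp.merge that keeps %y in the lanes
; below %evl and places 1 in the lanes at or past %evl (the mask is all-ones),
; then a plain sdiv is used.
define <vscale x 2 x i64> @safe_divisor_before(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, i32 %evl) {
  %safe.y = call <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> %y, <vscale x 2 x i64> splat (i64 1), i32 %evl)
  %d = sdiv <vscale x 2 x i64> %x, %safe.y
  ret <vscale x 2 x i64> %d
}

; After: the vp.merge is gone and the division itself is predicated on %evl.
; Lanes at or past %evl become poison, which is only sound when %evl is the
; EVL-based IV and those lanes are never read.
define <vscale x 2 x i64> @safe_divisor_after(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, i32 %evl) {
  %d = call <vscale x 2 x i64> @llvm.vp.sdiv.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i1> splat (i1 true), i32 %evl)
  ret <vscale x 2 x i64> %d
}

declare <vscale x 2 x i64> @llvm.vp.merge.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, <vscale x 2 x i64>, i32)
declare <vscale x 2 x i64> @llvm.vp.sdiv.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
```

The rewrite is only applied with an all-ones mask and the EVL-based IV, so the
tail lanes that become poison in the vp.sdiv form can never be observed.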