From f590430cc01775983249c32c5ab08abe40de7ef8 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 8 Apr 2025 15:58:43 +0100 Subject: [PATCH 1/4] Precommit tests --- .../CodeGen/RISCV/riscv-codegenprepare-asm.ll | 102 +++++++++++++++++- .../CodeGen/RISCV/riscv-codegenprepare.ll | 90 ++++++++++++++++ 2 files changed, 191 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll index 32261ee47164e..d3db332e1dd51 100644 --- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll +++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=riscv64 | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s ; Make sure we don't emit a pair of shift for the zext in the preheader. We @@ -127,3 +127,103 @@ for.body: ; preds = %for.body, %for.body %niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body } + +define i1 @widen_anyof_rdx(ptr %p, i64 %n) { +; CHECK-LABEL: widen_anyof_rdx: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma +; CHECK-NEXT: vmclr.m v12 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: .LBB2_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub a3, a1, a2 +; CHECK-NEXT: slli a4, a2, 2 +; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v14, (a4) +; CHECK-NEXT: vsetvli a4, zero, e32, m2, ta, ma +; CHECK-NEXT: vmsne.vi v13, v14, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vmsltu.vx v14, v8, a3 +; CHECK-NEXT: vmand.mm v13, v13, v14 +; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: blt a2, a1, .LBB2_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: vcpop.m a0, v12 +; CHECK-NEXT: snez a0, a0 +; 
CHECK-NEXT: ret +entry: + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %phi = phi [ zeroinitializer, %entry ], [ %rec, %loop ] + %avl = sub i64 %n, %iv + %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true) + + %gep = getelementptr i32, ptr %p, i64 %iv + %x = call @llvm.vp.load(ptr %gep, splat (i1 true), i32 %evl) + %cmp = icmp ne %x, zeroinitializer + %rec = call @llvm.vp.merge( %cmp, splat (i1 true), %phi, i32 %evl) + + %evl.zext = zext i32 %evl to i64 + %iv.next = add i64 %iv, %evl.zext + %done = icmp sge i64 %iv.next, %n + br i1 %done, label %exit, label %loop +exit: + %res = call i1 @llvm.vector.reduce.or( %rec) + ret i1 %res +} + + +define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) { +; CHECK-LABEL: widen_anyof_rdx_use_in_loop: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma +; CHECK-NEXT: vmclr.m v12 +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: .LBB3_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub a3, a1, a2 +; CHECK-NEXT: slli a4, a2, 2 +; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma +; CHECK-NEXT: add a4, a0, a4 +; CHECK-NEXT: vle32.v v14, (a4) +; CHECK-NEXT: vsetvli a5, zero, e32, m2, ta, ma +; CHECK-NEXT: vmsne.vi v13, v14, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; CHECK-NEXT: vmsltu.vx v14, v8, a3 +; CHECK-NEXT: vmand.mm v13, v13, v14 +; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: add a2, a2, a3 +; CHECK-NEXT: vsm.v v12, (a4) +; CHECK-NEXT: blt a2, a1, .LBB3_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: vcpop.m a0, v12 +; CHECK-NEXT: snez a0, a0 +; CHECK-NEXT: ret +entry: + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %phi = phi [ zeroinitializer, %entry ], [ %rec, %loop ] + %avl = sub i64 %n, %iv + %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true) + + %gep = getelementptr i32, ptr %p, i64 %iv + %x = call @llvm.vp.load(ptr %gep, splat 
(i1 true), i32 %evl) + %cmp = icmp ne %x, zeroinitializer + %rec = call @llvm.vp.merge( %cmp, splat (i1 true), %phi, i32 %evl) + + store %rec, ptr %gep + + %evl.zext = zext i32 %evl to i64 + %iv.next = add i64 %iv, %evl.zext + %done = icmp sge i64 %iv.next, %n + br i1 %done, label %exit, label %loop +exit: + %res = call i1 @llvm.vector.reduce.or( %rec) + ret i1 %res +} diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll index 2179a0d26cf98..3555309695f26 100644 --- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll +++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll @@ -103,3 +103,93 @@ define i64 @bug(i32 %x) { %b = and i64 %a, 4294967295 ret i64 %b } + +define i1 @widen_anyof_rdx(ptr %p, i64 %n) { +; CHECK-LABEL: @widen_anyof_rdx( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]] +; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]] +; CHECK-NEXT: [[X:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne [[X]], zeroinitializer +; CHECK-NEXT: [[TMP4]] = call @llvm.vp.merge.nxv4i1( [[CMP]], splat (i1 true), [[PHI]], i32 [[EVL]]) +; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]] +; CHECK-NEXT: [[DONE:%.*]] = icmp sge i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[RES:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP4]]) +; CHECK-NEXT: ret i1 [[RES]] +; +entry: + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ 
%iv.next, %loop ] + %phi = phi [ zeroinitializer, %entry ], [ %rec, %loop ] + %avl = sub i64 %n, %iv + %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true) + + %gep = getelementptr i32, ptr %p, i64 %iv + %x = call @llvm.vp.load(ptr %gep, splat (i1 true), i32 %evl) + %cmp = icmp ne %x, zeroinitializer + %rec = call @llvm.vp.merge( %cmp, splat (i1 true), %phi, i32 %evl) + + %evl.zext = zext i32 %evl to i64 + %iv.next = add i64 %iv, %evl.zext + %done = icmp sge i64 %iv.next, %n + br i1 %done, label %exit, label %loop +exit: + %res = call i1 @llvm.vector.reduce.or( %rec) + ret i1 %res +} + + +define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) { +; CHECK-LABEL: @widen_anyof_rdx_use_in_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[REC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]] +; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]] +; CHECK-NEXT: [[X:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], splat (i1 true), i32 [[EVL]]) +; CHECK-NEXT: [[CMP:%.*]] = icmp ne [[X]], zeroinitializer +; CHECK-NEXT: [[REC]] = call @llvm.vp.merge.nxv4i1( [[CMP]], splat (i1 true), [[PHI]], i32 [[EVL]]) +; CHECK-NEXT: store [[REC]], ptr [[GEP]], align 1 +; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]] +; CHECK-NEXT: [[DONE:%.*]] = icmp sge i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[RES:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[REC]]) +; CHECK-NEXT: ret i1 [[RES]] +; +entry: + br label %loop +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %phi = phi [ 
zeroinitializer, %entry ], [ %rec, %loop ] + %avl = sub i64 %n, %iv + %evl = call i32 @llvm.experimental.get.vector.length(i64 %avl, i32 4, i1 true) + + %gep = getelementptr i32, ptr %p, i64 %iv + %x = call @llvm.vp.load(ptr %gep, splat (i1 true), i32 %evl) + %cmp = icmp ne %x, zeroinitializer + %rec = call @llvm.vp.merge( %cmp, splat (i1 true), %phi, i32 %evl) + + store %rec, ptr %gep + + %evl.zext = zext i32 %evl to i64 + %iv.next = add i64 %iv, %evl.zext + %done = icmp sge i64 %iv.next, %n + br i1 %done, label %exit, label %loop +exit: + %res = call i1 @llvm.vector.reduce.or( %rec) + ret i1 %res +} From 7b4100dc226cb12856c0c1749786fd113c593873 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 8 Apr 2025 18:56:50 +0100 Subject: [PATCH 2/4] [RISCV] Widen i1 AnyOf reductions --- llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 75 +++++++++++++++++++ .../CodeGen/RISCV/riscv-codegenprepare-asm.ll | 48 ++++++------ .../CodeGen/RISCV/riscv-codegenprepare.ll | 10 ++- 3 files changed, 104 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index b5cb05f30fb26..d034d2c7270f8 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -58,6 +59,7 @@ class RISCVCodeGenPrepare : public FunctionPass, bool visitAnd(BinaryOperator &BO); bool visitIntrinsicInst(IntrinsicInst &I); bool expandVPStrideLoad(IntrinsicInst &I); + bool widenVPMerge(IntrinsicInst &I); }; } // end anonymous namespace @@ -103,6 +105,76 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) { return true; } +// With EVL tail folding, an AnyOf reduction will generate an i1 vp.merge like +// follows: +// +// loop: +// %phi = phi [ zeroinitializer, %entry ], [ %rec, %loop ] +// %cmp = icmp ... 
+// %rec = call @llvm.vp.merge(%cmp, i1 true, %phi, %evl) +// ... +// middle: +// %res = call i1 @llvm.vector.reduce.or( %rec) +// +// However RVV doesn't have any tail undisturbed mask instructions and so we +// need a convoluted sequence of mask instructions to lower the i1 vp.merge: see +// llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll. +// +// To avoid that this widens the i1 vp.merge to an i8 vp.merge, which will +// usually be folded into a masked vor.vv. +// +// loop: +// %phi = phi [ zeroinitializer, %entry ], [ %rec, %loop ] +// %cmp = icmp ... +// %rec = call @llvm.vp.merge(%cmp, i8 1, %phi, %evl) +// %trunc = trunc %rec to +// ... +// middle: +// %res = call i1 @llvm.vector.reduce.or( %trunc) +// +// The trunc will normally be sunk outside of the loop, but even if there are +// users inside the loop it is still profitable. +bool RISCVCodeGenPrepare::widenVPMerge(IntrinsicInst &II) { + if (!II.getType()->getScalarType()->isIntegerTy(1)) + return false; + + Value *Mask, *True, *PhiV, *EVL; + using namespace PatternMatch; + if (!match(&II, + m_Intrinsic(m_Value(Mask), m_Value(True), + m_Value(PhiV), m_Value(EVL)))) + return false; + + auto *Phi = dyn_cast(PhiV); + if (!Phi || Phi->getNumUses() > 2 || Phi->getNumIncomingValues() != 2 || + !match(Phi->getIncomingValue(0), m_Zero()) || + Phi->getIncomingValue(1) != &II) + return false; + + Type *WideTy = + VectorType::get(IntegerType::getInt8Ty(II.getContext()), + cast(II.getType())->getElementCount()); + + IRBuilder<> Builder(Phi); + PHINode *WidePhi = Builder.CreatePHI(WideTy, 2); + WidePhi->addIncoming(ConstantAggregateZero::get(WideTy), + Phi->getIncomingBlock(0)); + Builder.SetInsertPoint(&II); + Value *WideTrue = Builder.CreateZExt(True, WideTy); + Value *WideMerge = Builder.CreateIntrinsic(Intrinsic::vp_merge, {WideTy}, + {Mask, WideTrue, WidePhi, EVL}); + WidePhi->addIncoming(WideMerge, Phi->getIncomingBlock(1)); + Value *Trunc = Builder.CreateTrunc(WideMerge, II.getType()); + 
II.replaceAllUsesWith(Trunc); + + // Break the cycle and delete the old chain. + Phi->setIncomingValue(1, Phi->getIncomingValue(0)); + llvm::RecursivelyDeleteTriviallyDeadInstructions(&II); + + return true; +} + // LLVM vector reduction intrinsics return a scalar result, but on RISC-V vector // reduction instructions write the result in the first element of a vector // register. So when a reduction in a loop uses a scalar phi, we end up with @@ -138,6 +210,9 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) { if (expandVPStrideLoad(I)) return true; + if (widenVPMerge(I)) + return true; + if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd && !isa(&I)) return false; diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll index d3db332e1dd51..6136c321c08ca 100644 --- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll +++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll @@ -132,26 +132,25 @@ define i1 @widen_anyof_rdx(ptr %p, i64 %n) { ; CHECK-LABEL: widen_anyof_rdx: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma -; CHECK-NEXT: vmclr.m v12 -; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB2_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: sub a3, a1, a2 ; CHECK-NEXT: slli a4, a2, 2 -; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a3, a3, e32, m2, ta, ma ; CHECK-NEXT: add a4, a0, a4 -; CHECK-NEXT: vle32.v v14, (a4) -; CHECK-NEXT: vsetvli a4, zero, e32, m2, ta, ma -; CHECK-NEXT: vmsne.vi v13, v14, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vmsltu.vx v14, v8, a3 -; CHECK-NEXT: vmand.mm v13, v13, v14 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: vmsne.vi v0, v10, 0 ; CHECK-NEXT: add a2, a2, a3 -; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; 
CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: blt a2, a1, .LBB2_1 ; CHECK-NEXT: # %bb.2: # %exit -; CHECK-NEXT: vcpop.m a0, v12 +; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v8, v8, 0 +; CHECK-NEXT: vcpop.m a0, v8 ; CHECK-NEXT: snez a0, a0 ; CHECK-NEXT: ret entry: @@ -181,27 +180,26 @@ define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) { ; CHECK-LABEL: widen_anyof_rdx_use_in_loop: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: vsetvli a3, zero, e64, m4, ta, ma -; CHECK-NEXT: vmclr.m v12 -; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB3_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: sub a3, a1, a2 ; CHECK-NEXT: slli a4, a2, 2 -; CHECK-NEXT: vsetvli a3, a3, e8, mf2, ta, ma +; CHECK-NEXT: vsetvli a3, a3, e32, m2, ta, ma ; CHECK-NEXT: add a4, a0, a4 -; CHECK-NEXT: vle32.v v14, (a4) -; CHECK-NEXT: vsetvli a5, zero, e32, m2, ta, ma -; CHECK-NEXT: vmsne.vi v13, v14, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vmsltu.vx v14, v8, a3 -; CHECK-NEXT: vmand.mm v13, v13, v14 -; CHECK-NEXT: vmor.mm v12, v12, v13 +; CHECK-NEXT: vle32.v v10, (a4) +; CHECK-NEXT: vmsne.vi v0, v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma +; CHECK-NEXT: vand.vi v9, v8, 1 +; CHECK-NEXT: vmsne.vi v9, v9, 0 ; CHECK-NEXT: add a2, a2, a3 -; CHECK-NEXT: vsm.v v12, (a4) +; CHECK-NEXT: vsm.v v9, (a4) ; CHECK-NEXT: blt a2, a1, .LBB3_1 ; CHECK-NEXT: # %bb.2: # %exit -; CHECK-NEXT: vcpop.m a0, v12 +; CHECK-NEXT: vcpop.m a0, v9 ; CHECK-NEXT: snez a0, a0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll index 3555309695f26..cf5d0f107359a 100644 --- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll +++ 
b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll @@ -110,13 +110,14 @@ define i1 @widen_anyof_rdx(ptr %p, i64 %n) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PHI:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]] ; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]] ; CHECK-NEXT: [[X:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], splat (i1 true), i32 [[EVL]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp ne [[X]], zeroinitializer -; CHECK-NEXT: [[TMP4]] = call @llvm.vp.merge.nxv4i1( [[CMP]], splat (i1 true), [[PHI]], i32 [[EVL]]) +; CHECK-NEXT: [[TMP1]] = call @llvm.vp.merge.nxv4i8( [[CMP]], splat (i8 1), [[TMP0]], i32 [[EVL]]) +; CHECK-NEXT: [[TMP4:%.*]] = trunc [[TMP1]] to ; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]] ; CHECK-NEXT: [[DONE:%.*]] = icmp sge i64 [[IV_NEXT]], [[N]] @@ -154,13 +155,14 @@ define i1 @widen_anyof_rdx_use_in_loop(ptr %p, i64 %n) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PHI:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[REC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi [ zeroinitializer, [[ENTRY]] ], [ [[TMP1:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N:%.*]], [[IV]] ; CHECK-NEXT: [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IV]] ; CHECK-NEXT: [[X:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr [[GEP]], splat (i1 true), i32 [[EVL]]) ; 
CHECK-NEXT: [[CMP:%.*]] = icmp ne [[X]], zeroinitializer -; CHECK-NEXT: [[REC]] = call @llvm.vp.merge.nxv4i1( [[CMP]], splat (i1 true), [[PHI]], i32 [[EVL]]) +; CHECK-NEXT: [[TMP1]] = call @llvm.vp.merge.nxv4i8( [[CMP]], splat (i8 1), [[TMP0]], i32 [[EVL]]) +; CHECK-NEXT: [[REC:%.*]] = trunc [[TMP1]] to ; CHECK-NEXT: store [[REC]], ptr [[GEP]], align 1 ; CHECK-NEXT: [[EVL_ZEXT:%.*]] = zext i32 [[EVL]] to i64 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[EVL_ZEXT]] From 678851b52c1e830238a5cee3e8bddd89a108faff Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 21 Apr 2025 18:01:47 +0800 Subject: [PATCH 3/4] Reduce num of phi uses needed to 1 Previously, when we were matching the or, it was also a use of the phi, so up to two uses were allowed. Now require the phi to have exactly one use. --- llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index d034d2c7270f8..8bd0d0be88c64 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -146,7 +146,7 @@ bool RISCVCodeGenPrepare::widenVPMerge(IntrinsicInst &II) { return false; auto *Phi = dyn_cast(PhiV); - if (!Phi || Phi->getNumUses() > 2 || Phi->getNumIncomingValues() != 2 || + if (!Phi || !Phi->hasOneUse() || Phi->getNumIncomingValues() != 2 || !match(Phi->getIncomingValue(0), m_Zero()) || Phi->getIncomingValue(1) != &II) return false; From 39f50e60f9ce5f40919740e1b7ead9deb90db27d Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 21 Apr 2025 19:48:55 +0800 Subject: [PATCH 4/4] Update comment --- llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index 8bd0d0be88c64..ce349598bd9b1 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -121,7 +121,7 @@ bool 
RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) { // llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll. // // To avoid that this widens the i1 vp.merge to an i8 vp.merge, which will -// usually be folded into a masked vor.vv. +// generate a single vmerge.vim: // // loop: // %phi = phi [ zeroinitializer, %entry ], [ %rec, %loop ]