[RISCV] Lower VP_SELECT constant false to use vmerge.vxm/vmerge.vim #144461
Conversation
@llvm/pr-subscribers-backend-risc-v
Author: Liao Chunyu (ChunyuLiao)
Changes
Currently, when the false operand of a vp_select is a splat vector, it is lowered to a vmv_v_x/vmv_v_i. The vmv is hoisted out of the loop by MachineLICM, leaving the whole-register copy in the loop body. By inverting the mask register and swapping the true and false values in the vp_select, we can eliminate some instructions inside the loop.
current: https://godbolt.org/z/EnGMn3xeM
Full diff: https://github.com/llvm/llvm-project/pull/144461.diff
4 Files Affected:
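(Editorial illustration: a minimal sketch of the rewrite, based on the select_nxv2i64_constant_false test added below. The "before" sequence is approximate, register names are illustrative, and vsetvli is omitted.)

; vp.select with a splat constant on the false operand
%v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %m, <vscale x 2 x i64> %b, <vscale x 2 x i64> splat (i64 100), i32 %evl)

; before (approximate): the splat is materialized in its own vector register
li a1, 100
vmv.v.x v10, a1
vmerge.vvm v8, v10, v8, v0

; after: invert the mask and fold the constant straight into the merge
vmnot.m v0, v0
li a0, 100
vmerge.vxm v8, v8, a0, v0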
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7cfada6c0601c..ab36d0aeffa99 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -8170,11 +8170,17 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerRESET_FPENV(Op, DAG);
case ISD::EH_DWARF_CFA:
return lowerEH_DWARF_CFA(Op, DAG);
+ case ISD::VP_SELECT:
+ if (SDValue Op2 = Op.getOperand(2);
+ Op2.hasOneUse() && (Op2.getOpcode() == ISD::SPLAT_VECTOR ||
+ Op2.getOpcode() == ISD::SPLAT_VECTOR_PARTS))
+ return lowerVPSelectConstantFalse(Op, DAG);
+ else
+ return lowerVPOp(Op, DAG);
case ISD::VP_MERGE:
if (Op.getSimpleValueType().getVectorElementType() == MVT::i1)
return lowerVPMergeMask(Op, DAG);
[[fallthrough]];
- case ISD::VP_SELECT:
case ISD::VP_ADD:
case ISD::VP_SUB:
case ISD::VP_MUL:
@@ -13176,6 +13182,22 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op,
return convertFromScalableVector(VT, Result, DAG, Subtarget);
}
+SDValue RISCVTargetLowering::lowerVPSelectConstantFalse(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ MVT VT = Op.getSimpleValueType();
+ SDValue TrueVal = Op.getOperand(1);
+ SDValue FalseVal = Op.getOperand(2);
+ SDValue VL = Op.getOperand(3);
+
+ MVT MaskVT = VT.changeVectorElementType(MVT::i1);
+ SDValue AllOneMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+ SDValue NewMask = DAG.getNode(RISCVISD::VMXOR_VL, DL, MaskVT,
+ Op.getOperand(0), AllOneMask, VL);
+ return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, NewMask, FalseVal, TrueVal,
+ DAG.getUNDEF(VT), VL);
+}
+
SDValue RISCVTargetLowering::lowerVPMergeMask(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 417d684a62382..cf04e56f36288 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -477,6 +477,7 @@ class RISCVTargetLowering : public TargetLowering {
SDValue getTLSDescAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerVPSelectConstantFalse(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
index 75537406f3515..a9ed70b94c90f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/masked-load-int.ll
@@ -34,10 +34,10 @@ define <vscale x 1 x i8> @masked_load_passthru_nxv1i8(ptr %a, <vscale x 1 x i1>
; ZVE32: # %bb.0:
; ZVE32-NEXT: csrr a1, vlenb
; ZVE32-NEXT: srli a1, a1, 3
-; ZVE32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
-; ZVE32-NEXT: vmv.v.i v8, 0
-; ZVE32-NEXT: vsetvli zero, a1, e8, mf4, ta, mu
+; ZVE32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
; ZVE32-NEXT: vle8.v v8, (a0), v0.t
+; ZVE32-NEXT: vmnot.m v0, v0
+; ZVE32-NEXT: vmerge.vim v8, v8, 0, v0
; ZVE32-NEXT: ret
%load = call <vscale x 1 x i8> @llvm.masked.load.nxv1i8(ptr %a, i32 1, <vscale x 1 x i1> %mask, <vscale x 1 x i8> zeroinitializer)
ret <vscale x 1 x i8> %load
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
index 371ec7c790dda..3918a8009fde8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -470,6 +470,28 @@ define <vscale x 2 x i64> @select_nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i6
ret <vscale x 2 x i64> %v
}
+define <vscale x 2 x i64> @select_nxv2i64_constant_true(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_true:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v8, v8, -1, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> splat (i64 -1), <vscale x 2 x i64> %b, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @select_nxv2i64_constant_false(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv2i64_constant_false:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vmnot.m v0, v0
+; CHECK-NEXT: li a0, 100
+; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.select.nxv2i64(<vscale x 2 x i1> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> splat (i64 100), i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
declare <vscale x 4 x i64> @llvm.vp.select.nxv4i64(<vscale x 4 x i1>, <vscale x 4 x i64>, <vscale x 4 x i64>, i32)
define <vscale x 4 x i64> @select_nxv4i64(<vscale x 4 x i1> %a, <vscale x 4 x i64> %b, <vscale x 4 x i64> %c, i32 zeroext %evl) {
@@ -702,10 +724,10 @@ define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x
; CHECK-NEXT: and a4, a5, a4
; CHECK-NEXT: vsetvli zero, a4, e64, m8, ta, ma
; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0
-; CHECK-NEXT: bltu a2, a1, .LBB48_2
+; CHECK-NEXT: bltu a2, a1, .LBB50_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: .LBB48_2:
+; CHECK-NEXT: .LBB50_2:
; CHECK-NEXT: vmv1r.v v0, v7
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from c1d7c1b to d5cae13
; ZVE32-NEXT: vle8.v v8, (a0), v0.t
; ZVE32-NEXT: vmnot.m v0, v0
; ZVE32-NEXT: vmerge.vim v8, v8, 0, v0
I don't think using vmerge is better. Usually vmerge is slower.
> I don't think using vmerge is better. Usually vmerge is slower.
Can you help explain why vmerge is slower? According to this example, https://github.com/llvm/llvm-project/blob/main/llvm/test/tools/llvm-mca/RISCV/SiFiveX280/vector-integer-arithmetic.s#L1477:
vmv, 4 cycles
vmerge, 4 cycles
vmnot.m, 4 cycles
vsetvl, 3 cycles
There can be some microarchitectural optimizations, like mv/zero-idiom elimination; that is my point. And for this test case it is a regression, since we generate vmnot.m + vmerge.vim instead of a vmv.v.i, so the latency is doubled.
Thanks. For this case, it also saves one vsetvl instruction, but introduces one additional cycle of latency.
Yeah, this does look like a regression. Particularly since the vmv.vi would otherwise be loop invariant and hoisted out, but the vmerge.vxm will not be.
MVT MaskVT = VT.changeVectorElementType(MVT::i1);
SDValue AllOneMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
SDValue NewMask = DAG.getNode(RISCVISD::VMXOR_VL, DL, MaskVT,
Does the VMXOR get combined with any compare instruction that produces the mask?
I plan to write another patch to combine vmxor with setcc. This vmxor cannot be eliminated for now.
By switching to SDNode, we do not need additional patches; the existing optimization is reused. Thanks.
@@ -34,10 +34,10 @@ define <vscale x 1 x i8> @masked_load_passthru_nxv1i8(ptr %a, <vscale x 1 x i1>
; ZVE32: # %bb.0:
; ZVE32-NEXT: csrr a1, vlenb
; ZVE32-NEXT: srli a1, a1, 3
; ZVE32-NEXT: vsetvli a2, zero, e8, mf4, ta, ma
; ZVE32-NEXT: vmv.v.i v8, 0
Was this patch (partially) motivated by https://github.com/llvm/llvm-project/pull/144170/files#r2146240973? If so, could we solve it by changing the VL optimizer? We don't need to splat 0 for VLMAX elements.
I'll take a look at the VL optimizer. I'm not sure what this case is expected to be updated to; the earlier comment was that 'vmv.v.i is an unmasked instruction anyway so the difference between ma/mu shouldn't matter', but that doesn't seem to be a problem anymore.
What I meant was that, ideally, instead of the existing
vsetvli a2, zero, e8, mf4, ta, ma
vmv.v.i v8, 0
vsetvli zero, a1, e8, mf4, ta, mu
vle8.v v8, (a0), v0.t
we can have
vsetvli zero, a1, e8, mf4, ta, mu
vmv.v.i v8, 0
vle8.v v8, (a0), v0.t
In other words, use the shorter VL for both instructions instead of using VLMAX on the splat (we generally don't like VL toggling). The reason I brought up ma/mu was that I was justifying the correctness of vmv.v.i using vsetvli zero, a1, e8, mf4, ta, mu; aside from the VL difference, mask policy is the only other difference between those two vsetvli.
Based on this, I think
vsetvli zero, a1, e8, mf4, ta, mu
vmv.v.i v8, 0
vle8.v v8, (a0), v0.t
will be faster than your proposed
vsetvli zero, a1, e8, mf4, ta, ma
vle8.v v8, (a0), v0.t
vmnot.m v0, v0
vmerge.vim v8, v8, 0, v0
It turns out we should fix RISCVVectorPeephole rather than the VL optimizer; here is my solution for this particular VL-toggling problem: #144759
current: https://godbolt.org/z/EnGMn3xeM
expected similar form: https://godbolt.org/z/nWhGM6Ej5
I initially tried to resolve the inconsistency between the two cases, which triggered this regression. Appreciate your explanation.
Force-pushed from 3151db1 to d7354fa
Force-pushed from d7354fa to ec74ddd
Currently, when the false operand of a vp_select is a splat vector, it is lowered to a vmv_v_x/vmv_v_i. The vmv is hoisted out of the loop by MachineLICM, leaving the whole-register copy in the loop body.
By inverting the mask register and swapping the true and false values in the vp_select, we can eliminate some instructions inside the loop.
current: https://godbolt.org/z/EnGMn3xeM
expected similar form: https://godbolt.org/z/nWhGM6Ej5