Commit eb7e199

[RISCV][VLOPT] Allow users that are passthrus if tail elements aren't demanded (#124066)
The motivation for this is to allow reducing the VL when a user is a ternary pseudo, where the third operand is tied and also acts as a passthru.

When checking the users of an instruction, we currently bail if the user is used as a passthru, because all of its elements past VL will be used for the tail. We can allow passthru users if we know the tail of their result isn't used, which we will have computed beforehand after #124530.

It's worth noting that this all applies regardless of the tail policy, because tail agnostic still ends up using the passthru.

I've checked that SPEC CPU 2017 + llvm-test-suite pass with this (on qemu with rvv_ta_all_1s=true).

Fixes #123760
1 parent 83a1fe8 commit eb7e199
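
A minimal sketch of the pattern this enables, in the same shape as the new passthru_not_demanded MIR test added below: %y uses %x as a passthru through a tied operand, but only the first element of %y's result is demanded by %z, so %x's VL can be shrunk from VLMAX (-1) down to 1.

  %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */  ; VL can now be reduced to 1
  %y:vr = PseudoVADD_VV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */       ; %x is the tied passthru
  %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */       ; only demands VL=1 of %y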

File tree

4 files changed: +115 -36 lines changed

llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp

+19 -12

@@ -1188,6 +1188,25 @@ RISCVVLOptimizer::getMinimumVLForUser(MachineOperand &UserOp) {
     return std::nullopt;
   }
 
+  unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
+  const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
+  // Looking for an immediate or a register VL that isn't X0.
+  assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
+         "Did not expect X0 VL");
+
+  // If the user is a passthru it will read the elements past VL, so
+  // abort if any of the elements past VL are demanded.
+  if (UserOp.isTied()) {
+    assert(UserOp.getOperandNo() == UserMI.getNumExplicitDefs() &&
+           RISCVII::isFirstDefTiedToFirstUse(UserMI.getDesc()));
+    auto DemandedVL = DemandedVLs[&UserMI];
+    if (!DemandedVL || !RISCV::isVLKnownLE(*DemandedVL, VLOp)) {
+      LLVM_DEBUG(dbgs() << "    Abort because user is passthru in "
+                           "instruction with demanded tail\n");
+      return std::nullopt;
+    }
+  }
+
   // Instructions like reductions may use a vector register as a scalar
   // register. In this case, we should treat it as only reading the first lane.
   if (isVectorOpUsedAsScalarOp(UserOp)) {
@@ -1200,12 +1219,6 @@ RISCVVLOptimizer::getMinimumVLForUser(MachineOperand &UserOp) {
     return MachineOperand::CreateImm(1);
   }
 
-  unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
-  const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
-  // Looking for an immediate or a register VL that isn't X0.
-  assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) &&
-         "Did not expect X0 VL");
-
   // If we know the demanded VL of UserMI, then we can reduce the VL it
   // requires.
   if (auto DemandedVL = DemandedVLs[&UserMI]) {
@@ -1227,12 +1240,6 @@ std::optional<MachineOperand> RISCVVLOptimizer::checkUsers(MachineInstr &MI) {
       return std::nullopt;
     }
 
-    // If used as a passthru, elements past VL will be read.
-    if (UserOp.isTied()) {
-      LLVM_DEBUG(dbgs() << "    Abort because user used as tied operand\n");
-      return std::nullopt;
-    }
-
     auto VLOp = getMinimumVLForUser(UserOp);
     if (!VLOp)
       return std::nullopt;

llvm/test/CodeGen/RISCV/rvv/vl-opt.ll

+23

@@ -194,3 +194,26 @@ define <vscale x 4 x i32> @dont_optimize_tied_def(<vscale x 4 x i32> %a, <vscale
   ret <vscale x 4 x i32> %2
 }
 
+define void @optimize_ternary_use(<vscale x 4 x i16> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, ptr %p, iXLen %vl) {
+; NOVLOPT-LABEL: optimize_ternary_use:
+; NOVLOPT:       # %bb.0:
+; NOVLOPT-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; NOVLOPT-NEXT:    vzext.vf2 v14, v8
+; NOVLOPT-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; NOVLOPT-NEXT:    vmadd.vv v14, v10, v12
+; NOVLOPT-NEXT:    vse32.v v14, (a0)
+; NOVLOPT-NEXT:    ret
+;
+; VLOPT-LABEL: optimize_ternary_use:
+; VLOPT:       # %bb.0:
+; VLOPT-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; VLOPT-NEXT:    vzext.vf2 v14, v8
+; VLOPT-NEXT:    vmadd.vv v14, v10, v12
+; VLOPT-NEXT:    vse32.v v14, (a0)
+; VLOPT-NEXT:    ret
+  %1 = zext <vscale x 4 x i16> %a to <vscale x 4 x i32>
+  %2 = mul <vscale x 4 x i32> %b, %1
+  %3 = add <vscale x 4 x i32> %2, %c
+  call void @llvm.riscv.vse(<vscale x 4 x i32> %3, ptr %p, iXLen %vl)
+  ret void
+}

llvm/test/CodeGen/RISCV/rvv/vl-opt.mir

+61

@@ -209,3 +209,64 @@ body: |
   bb.1:
     %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
     PseudoRET
+...
+---
+# Can reduce %x even though %y uses it as a passthru, because %y's inactive elements aren't demanded
+name: passthru_not_demanded
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: passthru_not_demanded
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    %y:vr = PseudoVADD_VV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+...
+---
+# Can't reduce %x because %y uses it as a passthru, and %y's inactive elements are demanded by %z
+name: passthru_demanded
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: passthru_demanded
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    %y:vr = PseudoVADD_VV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %z:vr = PseudoVADD_VV_M1 $noreg, %y, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+...
+---
+# Can reduce %x even though %y uses it as a passthru, because %y's inactive elements aren't demanded
+name: passthru_not_demanded_passthru_chain
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: passthru_not_demanded_passthru_chain
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 %y, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %a:vr = PseudoVADD_VV_M1 %z, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %b:vr = PseudoVADD_VV_M1 $noreg, %a, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    %y:vr = PseudoVADD_VV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %z:vr = PseudoVADD_VV_M1 %y, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %a:vr = PseudoVADD_VV_M1 %z, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %b:vr = PseudoVADD_VV_M1 $noreg, %a, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+...
+---
+# Can't reduce %x because %y uses it as a passthru, and %y's inactive elements are ultimately demanded in %b
+name: passthru_demanded_passthru_chain
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: passthru_demanded_passthru_chain
+    ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %z:vr = PseudoVADD_VV_M1 %y, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %a:vr = PseudoVADD_VV_M1 %z, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %b:vr = PseudoVADD_VV_M1 $noreg, %a, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+    %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
+    %y:vr = PseudoVADD_VV_M1 %x, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %z:vr = PseudoVADD_VV_M1 %y, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %a:vr = PseudoVADD_VV_M1 %z, $noreg, $noreg, 1, 3 /* e8 */, 0 /* tu, mu */
+    %b:vr = PseudoVADD_VV_M1 $noreg, %a, $noreg, 2, 3 /* e8 */, 0 /* tu, mu */
+...

llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll

+12 -24

@@ -1638,9 +1638,8 @@ define <vscale x 1 x i64> @vmadd_vx_nxv1i64(<vscale x 1 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmadd.vv v10, v8, v9
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    vmerge.vvm v8, v8, v10, v0
@@ -1669,9 +1668,8 @@ define <vscale x 1 x i64> @vmadd_vx_nxv1i64_unmasked(<vscale x 1 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmadd.vv v10, v8, v9
 ; RV32-NEXT:    vsetvli zero, zero, e64, m1, tu, ma
 ; RV32-NEXT:    vmv.v.v v8, v10
@@ -1713,9 +1711,8 @@ define <vscale x 1 x i64> @vmadd_vx_nxv1i64_ta(<vscale x 1 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT:    vlse64.v v10, (a0), zero
 ; RV32-NEXT:    vmadd.vv v10, v8, v9
 ; RV32-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1776,9 +1773,8 @@ define <vscale x 2 x i64> @vmadd_vx_nxv2i64(<vscale x 2 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmadd.vv v12, v8, v10
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, ma
 ; RV32-NEXT:    vmerge.vvm v8, v8, v12, v0
@@ -1807,9 +1803,8 @@ define <vscale x 2 x i64> @vmadd_vx_nxv2i64_unmasked(<vscale x 2 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmadd.vv v12, v8, v10
 ; RV32-NEXT:    vsetvli zero, zero, e64, m2, tu, ma
 ; RV32-NEXT:    vmv.v.v v8, v12
@@ -1851,9 +1846,8 @@ define <vscale x 2 x i64> @vmadd_vx_nxv2i64_ta(<vscale x 2 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
-; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT:    vlse64.v v12, (a0), zero
 ; RV32-NEXT:    vmadd.vv v12, v8, v10
 ; RV32-NEXT:    vmerge.vvm v8, v8, v12, v0
 ; RV32-NEXT:    addi sp, sp, 16
@@ -1914,9 +1908,8 @@ define <vscale x 4 x i64> @vmadd_vx_nxv4i64(<vscale x 4 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmadd.vv v16, v8, v12
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
 ; RV32-NEXT:    vmerge.vvm v8, v8, v16, v0
@@ -1945,9 +1938,8 @@ define <vscale x 4 x i64> @vmadd_vx_nxv4i64_unmasked(<vscale x 4 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmadd.vv v16, v8, v12
 ; RV32-NEXT:    vsetvli zero, zero, e64, m4, tu, ma
 ; RV32-NEXT:    vmv.v.v v8, v16
@@ -1989,9 +1981,8 @@ define <vscale x 4 x i64> @vmadd_vx_nxv4i64_ta(<vscale x 4 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m4, ta, ma
-; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT:    vlse64.v v16, (a0), zero
 ; RV32-NEXT:    vmadd.vv v16, v8, v12
 ; RV32-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; RV32-NEXT:    addi sp, sp, 16
@@ -2054,9 +2045,8 @@ define <vscale x 8 x i64> @vmadd_vx_nxv8i64(<vscale x 8 x i64> %a, i64 %b, <vsca
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmadd.vv v24, v8, v16
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
 ; RV32-NEXT:    vmerge.vvm v8, v8, v24, v0
@@ -2085,9 +2075,8 @@ define <vscale x 8 x i64> @vmadd_vx_nxv8i64_unmasked(<vscale x 8 x i64> %a, i64
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmadd.vv v24, v8, v16
 ; RV32-NEXT:    vsetvli zero, zero, e64, m8, tu, ma
 ; RV32-NEXT:    vmv.v.v v8, v24
@@ -2130,9 +2119,8 @@ define <vscale x 8 x i64> @vmadd_vx_nxv8i64_ta(<vscale x 8 x i64> %a, i64 %b, <v
 ; RV32-NEXT:    sw a0, 8(sp)
 ; RV32-NEXT:    sw a1, 12(sp)
 ; RV32-NEXT:    addi a0, sp, 8
-; RV32-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
-; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT:    vlse64.v v24, (a0), zero
 ; RV32-NEXT:    vmadd.vv v24, v8, v16
 ; RV32-NEXT:    vmerge.vvm v8, v8, v24, v0
 ; RV32-NEXT:    addi sp, sp, 16
