[RISCV] Separate doLocalPostpass into new pass and move to post vector regalloc #88295
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes

This patch splits off part of the work to move vsetvli insertion to post regalloc in #70549. The doLocalPostpass operates outside of RISCVInsertVSETVLI's dataflow, so we can move it to its own pass. We can then move it to post vector regalloc, which should be a smaller change since it only touches GPR registers. A couple of things that are different from #70549:
- This manually fixes up the LiveIntervals rather than recomputing them via createAndComputeVirtRegInterval. I'm not sure if there's much of a difference with either.
- For the postpass it's sufficient to just check isUndef() in hasUndefinedMergeOp, i.e. we don't need to look up the def in VNInfo.
Running on llvm-test-suite and SPEC CPU 2017, there aren't any changes in the number of vsetvlis removed. We could also potentially turn off this pass for unoptimised builds.

Patch is 292.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/88295.diff

34 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 7af543f018ccbd..d405395dcf9ec4 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -61,6 +61,9 @@ void initializeRISCVExpandAtomicPseudoPass(PassRegistry &);
FunctionPass *createRISCVInsertVSETVLIPass();
void initializeRISCVInsertVSETVLIPass(PassRegistry &);
+FunctionPass *createRISCVCoalesceVSETVLIPass();
+void initializeRISCVCoalesceVSETVLIPass(PassRegistry &);
+
FunctionPass *createRISCVPostRAExpandPseudoPass();
void initializeRISCVPostRAExpandPseudoPass(PassRegistry &);
FunctionPass *createRISCVInsertReadWriteCSRPass();
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index a14f9a28354737..b774cc37ad79a2 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -28,15 +28,17 @@
#include "RISCVSubtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <queue>
using namespace llvm;
#define DEBUG_TYPE "riscv-insert-vsetvli"
#define RISCV_INSERT_VSETVLI_NAME "RISC-V Insert VSETVLI pass"
+#define RISCV_COALESCE_VSETVLI_NAME "RISC-V Coalesce VSETVLI pass"
STATISTIC(NumInsertedVSETVL, "Number of VSETVL inst inserted");
-STATISTIC(NumRemovedVSETVL, "Number of VSETVL inst removed");
+STATISTIC(NumCoalescedVSETVL, "Number of VSETVL inst coalesced");
static cl::opt<bool> DisableInsertVSETVLPHIOpt(
"riscv-disable-insert-vsetvl-phi-opt", cl::init(false), cl::Hidden,
@@ -190,6 +192,11 @@ static bool hasUndefinedMergeOp(const MachineInstr &MI,
if (UseMO.getReg() == RISCV::NoRegister)
return true;
+ if (UseMO.isUndef())
+ return true;
+ if (UseMO.getReg().isPhysical())
+ return false;
+
if (MachineInstr *UseMI = MRI.getVRegDef(UseMO.getReg())) {
if (UseMI->isImplicitDef())
return true;
@@ -777,11 +784,32 @@ class RISCVInsertVSETVLI : public MachineFunctionPass {
VSETVLIInfo &Info) const;
void computeIncomingVLVTYPE(const MachineBasicBlock &MBB);
void emitVSETVLIs(MachineBasicBlock &MBB);
- void doLocalPostpass(MachineBasicBlock &MBB);
void doPRE(MachineBasicBlock &MBB);
void insertReadVL(MachineBasicBlock &MBB);
};
+class RISCVCoalesceVSETVLI : public MachineFunctionPass {
+public:
+ static char ID;
+
+ RISCVCoalesceVSETVLI() : MachineFunctionPass(ID) {}
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveStacks>();
+
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return RISCV_COALESCE_VSETVLI_NAME; }
+};
+
} // end anonymous namespace
char RISCVInsertVSETVLI::ID = 0;
@@ -789,6 +817,11 @@ char RISCVInsertVSETVLI::ID = 0;
INITIALIZE_PASS(RISCVInsertVSETVLI, DEBUG_TYPE, RISCV_INSERT_VSETVLI_NAME,
false, false)
+char RISCVCoalesceVSETVLI::ID = 0;
+
+INITIALIZE_PASS(RISCVCoalesceVSETVLI, "riscv-coalesce-vsetvli",
+ RISCV_COALESCE_VSETVLI_NAME, false, false)
+
// Return a VSETVLIInfo representing the changes made by this VSETVLI or
// VSETIVLI instruction.
static VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) {
@@ -1510,7 +1543,10 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
auto &AVL = MI.getOperand(1);
auto &PrevAVL = PrevMI.getOperand(1);
- assert(MRI.isSSA());
+ assert(!AVL.isReg() || !AVL.getReg().isVirtual() ||
+ MRI.hasOneDef(AVL.getReg()));
+ assert(!PrevAVL.isReg() || !PrevAVL.getReg().isVirtual() ||
+ MRI.hasOneDef(PrevAVL.getReg()));
// If the AVL is a register, we need to make sure MI's AVL dominates PrevMI.
// For now just check that PrevMI uses the same virtual register.
@@ -1530,64 +1566,6 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,
return areCompatibleVTYPEs(PriorVType, VType, Used);
}
-void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) {
- MachineInstr *NextMI = nullptr;
- // We can have arbitrary code in successors, so VL and VTYPE
- // must be considered demanded.
- DemandedFields Used;
- Used.demandVL();
- Used.demandVTYPE();
- SmallVector<MachineInstr*> ToDelete;
- for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
-
- if (!isVectorConfigInstr(MI)) {
- doUnion(Used, getDemanded(MI, MRI, ST));
- continue;
- }
-
- Register VRegDef = MI.getOperand(0).getReg();
- if (VRegDef != RISCV::X0 &&
- !(VRegDef.isVirtual() && MRI->use_nodbg_empty(VRegDef)))
- Used.demandVL();
-
- if (NextMI) {
- if (!Used.usedVL() && !Used.usedVTYPE()) {
- ToDelete.push_back(&MI);
- // Leave NextMI unchanged
- continue;
- } else if (canMutatePriorConfig(MI, *NextMI, Used, *MRI)) {
- if (!isVLPreservingConfig(*NextMI)) {
- MI.getOperand(0).setReg(NextMI->getOperand(0).getReg());
- MI.getOperand(0).setIsDead(false);
- Register OldVLReg;
- if (MI.getOperand(1).isReg())
- OldVLReg = MI.getOperand(1).getReg();
- if (NextMI->getOperand(1).isImm())
- MI.getOperand(1).ChangeToImmediate(NextMI->getOperand(1).getImm());
- else
- MI.getOperand(1).ChangeToRegister(NextMI->getOperand(1).getReg(), false);
- if (OldVLReg) {
- MachineInstr *VLOpDef = MRI->getUniqueVRegDef(OldVLReg);
- if (VLOpDef && TII->isAddImmediate(*VLOpDef, OldVLReg) &&
- MRI->use_nodbg_empty(OldVLReg))
- VLOpDef->eraseFromParent();
- }
- MI.setDesc(NextMI->getDesc());
- }
- MI.getOperand(2).setImm(NextMI->getOperand(2).getImm());
- ToDelete.push_back(NextMI);
- // fallthrough
- }
- }
- NextMI = &MI;
- Used = getDemanded(MI, MRI, ST);
- }
-
- NumRemovedVSETVL += ToDelete.size();
- for (auto *MI : ToDelete)
- MI->eraseFromParent();
-}
-
void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
MachineInstr &MI = *I++;
@@ -1660,15 +1638,6 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
emitVSETVLIs(MBB);
- // Now that all vsetvlis are explicit, go through and do block local
- // DSE and peephole based demanded fields based transforms. Note that
- // this *must* be done outside the main dataflow so long as we allow
- // any cross block analysis within the dataflow. We can't have both
- // demanded fields based mutation and non-local analysis in the
- // dataflow at the same time without introducing inconsistencies.
- for (MachineBasicBlock &MBB : MF)
- doLocalPostpass(MBB);
-
// Insert PseudoReadVL after VLEFF/VLSEGFF and replace it with the vl output
// of VLEFF/VLSEGFF.
for (MachineBasicBlock &MBB : MF)
@@ -1682,3 +1651,121 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
FunctionPass *llvm::createRISCVInsertVSETVLIPass() {
return new RISCVInsertVSETVLI();
}
+
+// Now that all vsetvlis are explicit, go through and do block local
+// DSE and peephole based demanded fields based transforms. Note that
+// this *must* be done outside the main dataflow so long as we allow
+// any cross block analysis within the dataflow. We can't have both
+// demanded fields based mutation and non-local analysis in the
+// dataflow at the same time without introducing inconsistencies.
+bool RISCVCoalesceVSETVLI::runOnMachineFunction(MachineFunction &MF) {
+ // Skip if the vector extension is not enabled.
+ auto *ST = &MF.getSubtarget<RISCVSubtarget>();
+ if (!ST->hasVInstructions())
+ return false;
+
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+ bool Changed = false;
+
+ const auto *TII = ST->getInstrInfo();
+ auto *MRI = &MF.getRegInfo();
+ for (MachineBasicBlock &MBB : MF) {
+ MachineInstr *NextMI = nullptr;
+ // We can have arbitrary code in successors, so VL and VTYPE
+ // must be considered demanded.
+ DemandedFields Used;
+ Used.demandVL();
+ Used.demandVTYPE();
+ SmallVector<MachineInstr *> ToDelete;
+ for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
+
+ if (!isVectorConfigInstr(MI)) {
+ doUnion(Used, getDemanded(MI, MRI, ST));
+ continue;
+ }
+
+ Register VRegDef = MI.getOperand(0).getReg();
+ if (VRegDef != RISCV::X0 &&
+ !(VRegDef.isVirtual() && MRI->use_nodbg_empty(VRegDef)))
+ Used.demandVL();
+
+ if (NextMI) {
+ if (!Used.usedVL() && !Used.usedVTYPE()) {
+ ToDelete.push_back(&MI);
+ // Leave NextMI unchanged
+ continue;
+ } else if (canMutatePriorConfig(MI, *NextMI, Used, *MRI)) {
+ if (!isVLPreservingConfig(*NextMI)) {
+
+ Register DefReg = NextMI->getOperand(0).getReg();
+
+ MI.getOperand(0).setReg(DefReg);
+ MI.getOperand(0).setIsDead(false);
+
+ // The def of DefReg moved to MI, so extend the LiveInterval up to
+ // it.
+ if (DefReg.isVirtual()) {
+ LiveInterval &DefLI = LIS.getInterval(DefReg);
+ SlotIndex MISlot = LIS.getInstructionIndex(MI).getRegSlot();
+ VNInfo *DefVNI = DefLI.getVNInfoAt(DefLI.beginIndex());
+ LiveInterval::Segment S(MISlot, DefLI.beginIndex(), DefVNI);
+ DefLI.addSegment(S);
+ DefVNI->def = MISlot;
+
+ // DefReg may have had no uses, in which case we need to shrink
+ // the LiveInterval up to MI.
+ LIS.shrinkToUses(&DefLI);
+ }
+
+ Register OldVLReg;
+ if (MI.getOperand(1).isReg())
+ OldVLReg = MI.getOperand(1).getReg();
+ if (NextMI->getOperand(1).isImm())
+ MI.getOperand(1).ChangeToImmediate(
+ NextMI->getOperand(1).getImm());
+ else
+ MI.getOperand(1).ChangeToRegister(NextMI->getOperand(1).getReg(),
+ false);
+
+ // Clear NextMI's AVL early so we're not counting it as a use.
+ if (NextMI->getOperand(1).isReg())
+ NextMI->getOperand(1).setReg(RISCV::NoRegister);
+
+ if (OldVLReg) {
+ MachineInstr *VLOpDef = MRI->getUniqueVRegDef(OldVLReg);
+ if (VLOpDef && TII->isAddImmediate(*VLOpDef, OldVLReg) &&
+ MRI->use_nodbg_empty(OldVLReg)) {
+ VLOpDef->eraseFromParent();
+ }
+
+ // NextMI no longer uses OldVLReg so shrink its LiveInterval.
+ if (OldVLReg.isVirtual())
+ LIS.shrinkToUses(&LIS.getInterval(OldVLReg));
+ }
+
+ MI.setDesc(NextMI->getDesc());
+ }
+ MI.getOperand(2).setImm(NextMI->getOperand(2).getImm());
+ ToDelete.push_back(NextMI);
+ // fallthrough
+ }
+ }
+ NextMI = &MI;
+ Used = getDemanded(MI, MRI, ST);
+ }
+
+ Changed |= !ToDelete.empty();
+ NumCoalescedVSETVL += ToDelete.size();
+ for (auto *MI : ToDelete) {
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+ }
+
+ return Changed;
+}
+
+FunctionPass *llvm::createRISCVCoalesceVSETVLIPass() {
+ return new RISCVCoalesceVSETVLI();
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index ae1a6f179a49e3..2a56ce03d38a93 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -121,6 +121,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
initializeRISCVExpandPseudoPass(*PR);
initializeRISCVFoldMasksPass(*PR);
initializeRISCVInsertVSETVLIPass(*PR);
+ initializeRISCVCoalesceVSETVLIPass(*PR);
initializeRISCVInsertReadWriteCSRPass(*PR);
initializeRISCVInsertWriteVXRMPass(*PR);
initializeRISCVDAGToDAGISelPass(*PR);
@@ -394,6 +395,7 @@ FunctionPass *RISCVPassConfig::createRVVRegAllocPass(bool Optimized) {
bool RISCVPassConfig::addRegAssignAndRewriteFast() {
if (EnableSplitRegAlloc)
addPass(createRVVRegAllocPass(false));
+ addPass(createRISCVCoalesceVSETVLIPass());
return TargetPassConfig::addRegAssignAndRewriteFast();
}
@@ -402,6 +404,7 @@ bool RISCVPassConfig::addRegAssignAndRewriteOptimized() {
addPass(createRVVRegAllocPass(true));
addPass(createVirtRegRewriter(false));
}
+ addPass(createRISCVCoalesceVSETVLIPass());
return TargetPassConfig::addRegAssignAndRewriteOptimized();
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
index 8e214e40547832..9e83efd3519539 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -1407,8 +1407,8 @@ define <8 x float> @buildvec_v8f32_zvl256(float %e0, float %e1, float %e2, float
; CHECK-NEXT: vfmv.v.f v8, fa4
; CHECK-NEXT: vfslide1down.vf v8, v8, fa5
; CHECK-NEXT: vfslide1down.vf v8, v8, fa6
-; CHECK-NEXT: vmv.v.i v0, 15
; CHECK-NEXT: vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT: vmv.v.i v0, 15
; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t
; CHECK-NEXT: ret
%v0 = insertelement <8 x float> poison, float %e0, i64 0
@@ -1458,8 +1458,8 @@ define <8 x double> @buildvec_v8f64_zvl512(double %e0, double %e1, double %e2, d
; CHECK-NEXT: vfmv.v.f v8, fa4
; CHECK-NEXT: vfslide1down.vf v8, v8, fa5
; CHECK-NEXT: vfslide1down.vf v8, v8, fa6
-; CHECK-NEXT: vmv.v.i v0, 15
; CHECK-NEXT: vfslide1down.vf v8, v8, fa7
+; CHECK-NEXT: vmv.v.i v0, 15
; CHECK-NEXT: vslidedown.vi v8, v9, 4, v0.t
; CHECK-NEXT: ret
%v0 = insertelement <8 x double> poison, double %e0, i64 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
index 6bfd0ac932672f..ed152e64a91ef4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll
@@ -57,8 +57,8 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
; RV32-V512-NEXT: vid.v v10
; RV32-V512-NEXT: vsrl.vi v11, v10, 1
; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV32-V512-NEXT: vmv.v.i v0, 10
; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11
+; RV32-V512-NEXT: vmv.v.i v0, 10
; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t
; RV32-V512-NEXT: vmv.v.v v8, v10
; RV32-V512-NEXT: ret
@@ -68,8 +68,8 @@ define <4 x double> @interleave_v2f64(<2 x double> %x, <2 x double> %y) {
; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu
; RV64-V512-NEXT: vid.v v10
; RV64-V512-NEXT: vsrl.vi v11, v10, 1
-; RV64-V512-NEXT: vmv.v.i v0, 10
; RV64-V512-NEXT: vrgather.vv v10, v8, v11
+; RV64-V512-NEXT: vmv.v.i v0, 10
; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t
; RV64-V512-NEXT: vmv.v.v v8, v10
; RV64-V512-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
index 85b849045e8cee..a8e4af2d7368e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i-sat.ll
@@ -395,8 +395,8 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
; RV32-NEXT: fmin.d fa5, fa5, fa4
; RV32-NEXT: fcvt.w.d a2, fa5, rtz
; RV32-NEXT: and a0, a0, a2
-; RV32-NEXT: vmv.v.i v0, 15
; RV32-NEXT: vslide1down.vx v9, v9, a0
+; RV32-NEXT: vmv.v.i v0, 15
; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
; RV32-NEXT: vse8.v v9, (a1)
; RV32-NEXT: addi sp, s0, -128
@@ -496,8 +496,8 @@ define void @fp2si_v8f64_v8i8(ptr %x, ptr %y) {
; RV64-NEXT: fmin.d fa5, fa5, fa4
; RV64-NEXT: fcvt.l.d a2, fa5, rtz
; RV64-NEXT: and a0, a0, a2
-; RV64-NEXT: vmv.v.i v0, 15
; RV64-NEXT: vslide1down.vx v9, v9, a0
+; RV64-NEXT: vmv.v.i v0, 15
; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
; RV64-NEXT: vse8.v v9, (a1)
; RV64-NEXT: addi sp, s0, -128
@@ -580,8 +580,8 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
; RV32-NEXT: fmax.d fa4, fa4, fa3
; RV32-NEXT: fmin.d fa5, fa4, fa5
; RV32-NEXT: fcvt.wu.d a0, fa5, rtz
-; RV32-NEXT: vmv.v.i v0, 15
; RV32-NEXT: vslide1down.vx v9, v9, a0
+; RV32-NEXT: vmv.v.i v0, 15
; RV32-NEXT: vslidedown.vi v9, v8, 4, v0.t
; RV32-NEXT: vse8.v v9, (a1)
; RV32-NEXT: addi sp, s0, -128
@@ -656,8 +656,8 @@ define void @fp2ui_v8f64_v8i8(ptr %x, ptr %y) {
; RV64-NEXT: fmax.d fa4, fa4, fa3
; RV64-NEXT: fmin.d fa5, fa4, fa5
; RV64-NEXT: fcvt.lu.d a0, fa5, rtz
-; RV64-NEXT: vmv.v.i v0, 15
; RV64-NEXT: vslide1down.vx v9, v9, a0
+; RV64-NEXT: vmv.v.i v0, 15
; RV64-NEXT: vslidedown.vi v9, v8, 4, v0.t
; RV64-NEXT: vse8.v v9, (a1)
; RV64-NEXT: addi sp, s0, -128
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
index 6da83644413bc2..40ff8b50d99d8d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -70,8 +70,8 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
; RV32-V512-NEXT: vid.v v10
; RV32-V512-NEXT: vsrl.vi v11, v10, 1
; RV32-V512-NEXT: vsetvli zero, zero, e64, m1, ta, mu
-; RV32-V512-NEXT: vmv.v.i v0, 10
; RV32-V512-NEXT: vrgatherei16.vv v10, v8, v11
+; RV32-V512-NEXT: vmv.v.i v0, 10
; RV32-V512-NEXT: vrgatherei16.vv v10, v9, v11, v0.t
; RV32-V512-NEXT: vmv.v.v v8, v10
; RV32-V512-NEXT: ret
@@ -81,8 +81,8 @@ define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
; RV64-V512-NEXT: vsetivli zero, 4, e64, m1, ta, mu
; RV64-V512-NEXT: vid.v v10
; RV64-V512-NEXT: vsrl.vi v11, v10, 1
-; RV64-V512-NEXT: vmv.v.i v0, 10
; RV64-V512-NEXT: vrgather.vv v10, v8, v11
+; RV64-V512-NEXT: vmv.v.i v0, 10
; RV64-V512-NEXT: vrgather.vv v10, v9, v11, v0.t
; RV64-V512-NEXT: vmv.v.v v8, v10
; RV64-V512-NEXT: ret
@@ -195,8 +195,8 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu
; V128-NEXT: vid.v v8
; V128-NEXT: vsrl.vi v8, v8, 1
-; V128-NEXT: vmv.v.i v0, 10
; V128-NEXT: vadd.vi v8, v8, 1
+; V128-NEXT: vmv.v.i v0, 10
; V128-NEXT: vrgather.vv v10, v9, v8, v0.t
; V128-NEXT: vmv.v.v v8, v10
; V128-NEXT: ret
@@ -210,8 +210,8 @@ define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu
; V512-NEXT: vid.v v8
; V512-NEXT: vsrl.vi v8, v8, 1
-; V512-NEXT: vmv.v.i v0, 10
; V512-NEXT: vadd.vi v8, v8, 1
+; V512-NEXT: vmv.v.i v0, 10
; V512-NEXT: vrgather.vv v10, v9, v8, v0.t
; V512-NEXT: vmv1r.v v8, v10
; V512-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index 0e8d9cf0306690..58af6ac246d161 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -89,8 +89,8 @@ define <4 x i16> @vrgather_shuffle_vv_v4i16(<4 x i16> %x, <4 x i16> %y) {
; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0)
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: vle16.v v11, (a0)
-; CHECK-NEXT: vmv.v.i v0, 8
; CHECK-NEXT: vrgather.vv v10, v8, v11
+; CHECK-NEXT: vmv.v.i v0, 8
; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t
; CHECK-NEXT: vmv1r.v v8, v10
; CHECK-NEXT: ret
@@ -162,16 +162,16 @@ define <8 x i64> @vrgather_shuffle_vv_v8i64(<8 x i64> %x, <8 x i64> %y) {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; RV32-NEXT: vmv.v.i v16, 2
-; RV32-NEXT: li a0, 5
-; RV32-NEXT: vslide1down.vx v20, v16, a0
; RV32-NEXT: lui a0, %hi(.LCPI11_0)
; RV32-NEXT: a...
[truncated]
@@ -27,7 +27,7 @@ body: |
 ; CHECK-NEXT: $x12 = frame-setup SLLI killed $x12, 3
 ; CHECK-NEXT: $x2 = frame-setup SUB $x2, killed $x12
 ; CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22
-; CHECK-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x11, 152 /* e64, m1, tu, ma */, implicit-def $vl, implicit-def $vtype
+; CHECK-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x11, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
This diff is caused by returning true for isUndef() in hasUndefinedMergeOp.
Does this change not impact
Dbg values will be emitted during the register allocation passes. If we insert another pass between the register allocation stages, then LiveDebugVariables will raise an assertion when it is destructed without having emitted the Dbg values.
For example:
with this patch. A workaround is
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
Additional spill code here, likely caused by an extra vsetvli dependence during machine scheduling. There is a possibility that the scheduler itself will be able to fix this issue once we move the vsetvl insertion pass into postRA completely.
; without patch
200B %11:vrm2nov0 = PseudoVSRL_VI_M2 undef %11:vrm2nov0(tied-def 0), %7:vrm2, 1, -1, 4, 3, implicit $vl, implicit $vtype
; with patch
160B %11:vrm2nov0 = PseudoVSRL_VI_M2 undef %11:vrm2nov0(tied-def 0), %7:vrm2, 1, -1, 4, 3, implicit $vl, implicit $vtype
; Without patch
# *** IR Dump After Machine Instruction Scheduler (machine-scheduler) ***:
# Machine code for function vector_interleave_nxv16i64_nxv8i64: NoPHIs, TracksLiveness, TiedOpsRewritten
Function Live Ins: $v8m8 in %0, $v16m8 in %1
0B bb.0 (%ir-block.0):
liveins: $v8m8, $v16m8
16B %16:vrm8 = COPY $v16m8
32B %0:vrm8 = COPY $v8m8
96B %5:gpr = PseudoReadVLENB
112B %6:gpr = SRLI %5:gpr, 1
128B dead %18:gpr = PseudoVSETVLIX0 $x0, 73, implicit-def $vl, implicit-def $vtype
144B %7:vrm2 = PseudoVID_V_M2 undef %7:vrm2(tied-def 0), -1, 4, 1, implicit $vl, implicit $vtype
176B %9:vrm2 = PseudoVAND_VI_M2 undef %9:vrm2(tied-def 0), %7:vrm2, 1, -1, 4, 3, implicit $vl, implicit $vtype
192B early-clobber %10:vr = PseudoVMSNE_VI_M2 %9:vrm2, 0, -1, 4, implicit $vl, implicit $vtype
200B %11:vrm2nov0 = PseudoVSRL_VI_M2 undef %11:vrm2nov0(tied-def 0), %7:vrm2, 1, -1, 4, 3, implicit $vl, implicit $vtype
256B %12:vrm8 = COPY %0:vrm8
260B $v0 = COPY %10:vr
264B %11:vrm2nov0 = PseudoVADD_VX_M2_MASK %11:vrm2nov0(tied-def 0), %11:vrm2nov0, %6:gpr, $v0, -1, 4, 1, implicit $vl, implicit $vtype
272B %12.sub_vrm4_1:vrm8 = COPY %16.sub_vrm4_0:vrm8
288B dead $x0 = PseudoVSETVLIX0 killed $x0, 219, implicit-def $vl, implicit-def $vtype, implicit $vl
304B early-clobber %13:vrm8 = PseudoVRGATHEREI16_VV_M8_E64_M2 undef %13:vrm8(tied-def 0), %12:vrm8, %11:vrm2nov0, -1, 6, 1, implicit $vl, implicit $vtype
320B %16.sub_vrm4_0:vrm8 = COPY %0.sub_vrm4_1:vrm8
368B early-clobber %17:vrm8 = PseudoVRGATHEREI16_VV_M8_E64_M2 undef %17:vrm8(tied-def 0), %16:vrm8, %11:vrm2nov0, -1, 6, 1, implicit $vl, implicit $vtype
384B $v8m8 = COPY %13:vrm8
400B $v16m8 = COPY %17:vrm8
416B PseudoRET implicit $v8m8, implicit $v16m8
; With patch
# *** IR Dump After Machine Instruction Scheduler (machine-scheduler) ***:
# Machine code for function vector_interleave_nxv16i64_nxv8i64: NoPHIs, TracksLiveness, TiedOpsRewritten
Function Live Ins: $v8m8 in %0, $v16m8 in %1
0B bb.0 (%ir-block.0):
liveins: $v8m8, $v16m8
16B %16:vrm8 = COPY $v16m8
32B %0:vrm8 = COPY $v8m8
96B %5:gpr = PseudoReadVLENB
112B %6:gpr = SRLI %5:gpr, 1
128B dead %18:gpr = PseudoVSETVLIX0 $x0, 201, implicit-def $vl, implicit-def $vtype
144B %7:vrm2 = PseudoVID_V_M2 undef %7:vrm2(tied-def 0), -1, 4, 1, implicit $vl, implicit $vtype
160B %11:vrm2nov0 = PseudoVSRL_VI_M2 undef %11:vrm2nov0(tied-def 0), %7:vrm2, 1, -1, 4, 3, implicit $vl, implicit $vtype
176B %9:vrm2 = PseudoVAND_VI_M2 undef %9:vrm2(tied-def 0), %7:vrm2, 1, -1, 4, 3, implicit $vl, implicit $vtype
192B early-clobber %10:vr = PseudoVMSNE_VI_M2 %9:vrm2, 0, -1, 4, implicit $vl, implicit $vtype
224B dead $x0 = PseudoVSETVLIX0 killed $x0, 73, implicit-def $vl, implicit-def $vtype, implicit $vl
272B %12:vrm8 = COPY %0:vrm8
276B $v0 = COPY %10:vr
280B %11:vrm2nov0 = PseudoVADD_VX_M2_MASK %11:vrm2nov0(tied-def 0), %11:vrm2nov0, %6:gpr, $v0, -1, 4, 1, implicit $vl, implicit $vtype
288B %12.sub_vrm4_1:vrm8 = COPY %16.sub_vrm4_0:vrm8
304B dead $x0 = PseudoVSETVLIX0 killed $x0, 219, implicit-def $vl, implicit-def $vtype, implicit $vl
320B early-clobber %13:vrm8 = PseudoVRGATHEREI16_VV_M8_E64_M2 undef %13:vrm8(tied-def 0), %12:vrm8, %11:vrm2nov0, -1, 6, 1, implicit $vl, implicit $vtype
336B %16.sub_vrm4_0:vrm8 = COPY %0.sub_vrm4_1:vrm8
384B early-clobber %17:vrm8 = PseudoVRGATHEREI16_VV_M8_E64_M2 undef %17:vrm8(tied-def 0), %16:vrm8, %11:vrm2nov0, -1, 6, 1, implicit $vl, implicit $vtype
400B $v8m8 = COPY %13:vrm8
416B $v16m8 = COPY %17:vrm8
432B PseudoRET implicit $v8m8, implicit $v16m8
Additional spill code here, likely caused by an extra vsetvli dependence during machine scheduling. There is a possibility that the scheduler itself will be able to fix this issue once we move the vsetvl insertion pass into postRA completely.
Yes, that was my thoughts as well. For what its worth there doesn't seem to be much extra spilling in practice, on llvm test suite with SPEC CPU 2017 there's actually slightly less spills overall. But the difference is so small I would just chalk it up to noise
regalloc.NumSpills
l/r lhs rhs diff
count 2067.000000 2067.000000 278.000000
mean 88.670053 88.665215 0.000010
std 1155.298479 1155.264766 0.000260
min 0.000000 0.000000 -0.001185
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000
75% 0.000000 0.000000 0.000000
max 44054.000000 44054.000000 0.004115
Whoops, yes it does. Thanks, should be updated now.
Thanks, that makes sense. I was curious as to how AMDGPU handled this since it also splits regalloc and inserts a pass in between, but it looks like said pass is marked as
Force-pushed from 3a1b372 to 0a92d90
(Focused purely on the extra spill question - code review to follow separately)
Focusing on the specific example presented, I think it's important to note that we have multiple improvements possible in this sequence.
The most important one is that we can split this shuffle and entirely eliminate the high-LMUL constraint problem. This is computing an interleave. From first principles, we know that each output VREG reads exactly two input VREGs, and we know exactly which ones. This is a case where we can split a high-LMUL vrgather into a linear number of small-LMUL shuffles even without knowing the exact value of VLEN. Doing so would give the register allocator significantly more flexibility here.
Do we have other real examples? Based on Luke's numbers, this doesn't seem to happen frequently to begin with. If all of our examples can be independently improved in practice, I think we can safely ignore this.
; CHECK-NEXT: vmv1r.v v0, v10
; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t
; CHECK-NEXT: vsrl.vi v6, v24, 1
; CHECK-NEXT: vand.vi v8, v24, 1
There's something a bit odd here with both the before and after code. This vand.vi appears to be the last use of the v24 register. Why are we using v8 as a temporary here at all? I don't believe vand.vi has a register overlap constraint... If we reused v24 here, we'd reduce register pressure slightly.
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vadd.vx v6, v6, a0, v0.t
Hm, interestingly we have a srl by 1 on both the vector (v6) and scalar (a0) inputs here. By sinking it, we could reduce register pressure by one.
Also of note is that this add is really a disjoint or. We probably should be canonicalizing to or here, and the fact we're not hints at either missing known bits or a missing combine. I don't expect this to have any practical impact in this case. (Though, it might enable that shift combine just mentioned.)
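To unpack the terminology for readers following along: a "disjoint or" is an add whose operands share no set bits, so the add and the or produce the same result. A tiny standalone check of that identity (the constants below are made up purely for illustration and are not taken from the code above):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical operands chosen so that they have no bits in common.
  uint64_t HighPart = 0x40;
  uint64_t LowBit = 0x1;
  assert((HighPart & LowBit) == 0);                   // operands are disjoint...
  assert((HighPart + LowBit) == (HighPart | LowBit)); // ...so add == or
  return 0;
}
```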
@BeMg I looked at some of the diffs on SPEC CPU 2017 more closely, there are only two extra spills and at least one of them is a scalar spill, so I'm not worried about it. I can see some differences with regard to scheduling though; it mainly seems to be where we're no longer able to move instructions above a load or store. Here's one such example:
--- build.a/External/SPEC/CFP2017rate/526.blender_r/CMakeFiles/526.blender_r.dir/root/cpu2017/benchspec/CPU/526.blender_r/src/blender/source/blender/blenkernel/intern/seqeffects.s 2024-04-10 16:43:45.181903504 +0000
+++ build.b/External/SPEC/CFP2017rate/526.blender_r/CMakeFiles/526.blender_r.dir/root/cpu2017/benchspec/CPU/526.blender_r/src/blender/source/blender/blenkernel/intern/seqeffects.s 2024-04-10 16:43:56.813588028 +0000
@@ -9253,30 +9253,30 @@
vsetivli zero, 1, e8, mf8, ta, ma
vmv.v.i v8, 3
vsetivli zero, 8, e32, m2, ta, mu
- vmv1r.v v0, v8
- vfmerge.vfm v14, v12, fa3, v0
- vfmv.v.f v10, fa4
- vfmerge.vfm v16, v10, fa5, v0
.Lpcrel_hi169:
auipc t0, %pcrel_hi(.LCPI57_0)
addi t0, t0, %pcrel_lo(.Lpcrel_hi169)
- vle16.v v18, (t0)
+ vle16.v v14, (t0)
+ vmv1r.v v0, v8
+ vfmerge.vfm v16, v12, fa3, v0
+ vfmv.v.f v10, fa4
+ vfmerge.vfm v18, v10, fa5, v0
+ vrgatherei16.vv v10, v18, v14
As you say, I think this is caused by an invisible vsetvli after the vle16.v that's coalesced in the postpass but temporarily exists before regalloc occurs. So hopefully not an issue if we finish moving the rest of RISCVInsertVSETVLI to post-regalloc.
@@ -1510,7 +1545,10 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI,

  auto &AVL = MI.getOperand(1);
  auto &PrevAVL = PrevMI.getOperand(1);
- assert(MRI.isSSA());
+ assert(!AVL.isReg() || !AVL.getReg().isVirtual() ||
+        MRI.hasOneDef(AVL.getReg()));
Are AVL and PrevAVL always OneDef?
Apparently so, but I will double check. It could just be that we don't have tests for AVLs coming from a phi?
I was able to create an .mir test case that fails this assertion at least. I think it should be ok if we just return false if there's more than one def?
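For concreteness, a minimal sketch of that conservative bail-out, written as a fragment that would sit inside canMutatePriorConfig from the diff above (whether this is the exact form that eventually landed is an assumption):

```cpp
// Fragment for canMutatePriorConfig(): rather than asserting that a virtual
// AVL register has a single definition, give up on coalescing when it doesn't.
auto &AVL = MI.getOperand(1);
auto &PrevAVL = PrevMI.getOperand(1);
if (AVL.isReg() && AVL.getReg().isVirtual() && !MRI.hasOneDef(AVL.getReg()))
  return false;
if (PrevAVL.isReg() && PrevAVL.getReg().isVirtual() &&
    !MRI.hasOneDef(PrevAVL.getReg()))
  return false;
```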
// DefReg may have had no uses, in which case we need to shrink
// the LiveInterval up to MI.
LIS.shrinkToUses(&DefLI);
This function comes from llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.ll, and I found that the LiveInterval weight is computed differently between createAndComputeVirtRegInterval and the manual fix approach.
; llc -mtriple=riscv64 -mattr=+m,+f,+d,+a,+c,+v -target-abi=lp64d -verify-machineinstrs -O2
define i64 @bad_removal(<2 x i64> %arg) {
bb:
%tmp = extractelement <2 x i64> %arg, i64 0
%tmp1 = call i64 @llvm.riscv.vsetvli.i64(i64 16, i64 3, i64 0)
%tmp2 = add i64 %tmp, %tmp1
ret i64 %tmp2
}
; Manual fix LI
; First Greedy RA
%2 [64r,80r:0) 0@64r weight:INF
;; This pass update %2
; Second Greedy RA
%2 [32r,80r:0) 0@32r weight:INF
; createAndComputeVirtRegInterval
; First Greedy RA
%2 [64r,80r:0) 0@64r weight:INF
;; This pass update %2
; Second Greedy RA
%2 [32r,80r:0) 0@32r weight:4.464286e-03
I think the major reason is that the manual fix approach inherits the weight from the old LI, but the old LI was marked as NotSpillable because it was small enough. In the second run the weight will not be recomputed for an unspillable register, even though the LI has already changed.
When fixing it up manually, resetting the LI weight could be a solution to this situation.
DefLI.setWeight(0);
Thanks for catching this, resetting the weight to 0 does the trick. I can't find any other targets that work around this, but I presume this is a snag that we only run into when doing split RA.
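For context, here is a hedged sketch of where that reset would slot into the manual LiveInterval fixup from this patch; the surrounding code is copied from the diff above and only the setWeight(0) call is new (treating a plain reset to zero as sufficient is an assumption):

```cpp
if (DefReg.isVirtual()) {
  LiveInterval &DefLI = LIS.getInterval(DefReg);
  SlotIndex MISlot = LIS.getInstructionIndex(MI).getRegSlot();
  VNInfo *DefVNI = DefLI.getVNInfoAt(DefLI.beginIndex());
  // The def of DefReg moved to MI, so extend the LiveInterval up to it.
  DefLI.addSegment(LiveInterval::Segment(MISlot, DefLI.beginIndex(), DefVNI));
  DefVNI->def = MISlot;
  // The interval just changed, so drop the cached spill weight (which the
  // first regalloc run may have marked unspillable) and let the second run
  // recompute it.
  DefLI.setWeight(0);
  // DefReg may have had no uses, in which case we need to shrink
  // the LiveInterval up to MI.
  LIS.shrinkToUses(&DefLI);
}
```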
Force-pushed from f460113 to bc3fcda
Refactored to remove code motion
LGTM
…r regalloc

This patch splits off part of the work to move vsetvli insertion to post regalloc in llvm#70549. The doLocalPostpass operates outside of RISCVInsertVSETVLis dataflow, so we can move it to its own pass. We can then move it to post vector regalloc, which should be a smaller change since it only touches GPR registers. A couple of things that are different from llvm#70549:
- This manually fixes up the LiveIntervals rather than recomputing it via createAndComputeVirtRegInterval. I'm not sure if there's much of a difference with either.
- For the postpass it's sufficient enough to just check isUndef() in hasUndefinedMergeOp, i.e. we don't need to lookup the def in VNInfo.
Running on llvm-test-suite and SPEC CPU 2017 there aren't any changes in the number of vsetvls removed. We could also potentially turn off this pass for unoptimised builds.
The weight will be recalculated so we can just stick in 0.
Force-pushed from 9f0e8a3 to 918399f
After this patch, additional
Before:
After:
See dtcxzyw/llvm-codegen-benchmark@0b07efe for more cases.
This is to be expected as we now have some extra vsetvlis that live through vector regalloc (but are then coalesced before scalar regalloc), so the live ranges may be slightly longer. For what it's worth I don't believe this is a serious regression; there are some other places where we remove copies as well. Scheduling and vector regalloc should significantly improve once #70549 lands.
I'm planning on reverting this by the way, there's an address sanitizer failure on one of the buildbots.
…st vector regalloc (#88295)"
Seems to cause an address sanitizer failure on one of the buildbots related to live intervals.
…ost vector regalloc (#88295)"
The original commit was calling shrinkToUses on an interval for a virtual register whose def was erased. This fixes it by calling shrinkToUses first and removing the interval if we erase the old VL def.
…rtReadVL
In llvm#88295 we split out coalesceVSETVLIs (f.k.a. doLocalPostpass) into a separate pass, which meant moving it past the call to insertReadVL. When we merged it back in llvm#92869, we accidentally moved it after insertReadVL. This patch moves it back to its original position.
This patch splits off part of the work to move vsetvli insertion to post regalloc in #70549.
The doLocalPostpass operates outside of RISCVInsertVSETVLI's dataflow, so we can move it to its own pass. We can then move it to post vector regalloc which should be a smaller change.
A couple of things that are different from #70549:
- This manually fixes up the LiveIntervals rather than recomputing them via createAndComputeVirtRegInterval. I'm not sure if there's much of a difference with either (a sketch of the recompute alternative is included below).
- For the postpass it's sufficient to just check isUndef() in hasUndefinedMergeOp, i.e. we don't need to look up the def in VNInfo.
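Regarding the first point, a minimal sketch of what the recompute alternative would look like, assuming DefReg and LIS are in scope as in the coalescing loop above (this is not what the patch does; it is shown only for contrast):

```cpp
// Alternative to the manual segment surgery: discard the stale interval and
// let LiveIntervals rebuild it from the moved def and the remaining uses.
if (DefReg.isVirtual()) {
  LIS.removeInterval(DefReg);
  LIS.createAndComputeVirtRegInterval(DefReg);
}
```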
Running on llvm-test-suite and SPEC CPU 2017, there aren't any changes in the number of vsetvlis removed. There are some minor scheduling diffs as well as extra spills and fewer spills in some cases (caused by transient vsetvlis existing between RISCVInsertVSETVLI and RISCVCoalesceVSETVLI when vector regalloc happens), but they are minor and should go away once we finish moving the rest of RISCVInsertVSETVLI.
We could also potentially turn off this pass for unoptimised builds.
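For that last sentence, a hedged sketch of what gating the pass on the optimization level might look like in RISCVPassConfig; this is not part of the patch, and using getOptLevel() here as the knob is an assumption:

```cpp
bool RISCVPassConfig::addRegAssignAndRewriteFast() {
  if (EnableSplitRegAlloc)
    addPass(createRVVRegAllocPass(false));
  // Hypothetical: skip the coalescing cleanup entirely for unoptimised builds.
  if (getOptLevel() != CodeGenOptLevel::None)
    addPass(createRISCVCoalesceVSETVLIPass());
  return TargetPassConfig::addRegAssignAndRewriteFast();
}
```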