[RISCV] Fold vmv.x.s into load from stack #109774
Conversation
If a vector is reloaded from the stack to be used in vmv.x.s, we can tell foldMemoryOperandImpl to fold it into a scalar load. If XLEN < SEW then this currently just bails. I couldn't think of a way to express a vmv.x.s that truncates in LLVM IR.
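For illustration, here is roughly what the fold buys us, based on the i64 test below: on RV64 the vector reload feeding vmv.x.s collapses into one scalar load, whereas RV32 still has to reload the vector because XLEN < SEW. This is a sketch of the shape, not verbatim compiler output.

```
# Without the fold (sketch): reload the spilled vector just to read element 0.
vl1r.v   v8, (a1)                    # Unknown-size Folded Reload
vsetivli zero, 1, e64, m1, ta, ma
vmv.x.s  a0, v8

# With the fold (RV64, SEW=64 <= XLEN): a single scalar load from the spill slot.
ld       a0, 16(sp)                  # 8-byte Folded Reload
```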
@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes

If a vector is reloaded from the stack to be used in vmv.x.s, we can tell foldMemoryOperandImpl to fold it into a scalar load. If XLEN < SEW then this currently just bails. I couldn't think of a way to express a vmv.x.s that truncates in LLVM IR.

Full diff: https://github.com/llvm/llvm-project/pull/109774.diff

2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 41f93fde17d329..8210f756f3a249 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -763,6 +763,29 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
LoadOpc = RISCV::LBU;
break;
}
+ if (RISCV::getRVVMCOpcode(MI.getOpcode()) == RISCV::VMV_X_S) {
+ unsigned Log2SEW =
+ MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
+ if (STI.getXLen() < (1 << Log2SEW))
+ return nullptr;
+ switch (Log2SEW) {
+ case 3:
+ LoadOpc = RISCV::LB;
+ break;
+ case 4:
+ LoadOpc = RISCV::LH;
+ break;
+ case 5:
+ LoadOpc = RISCV::LW;
+ break;
+ case 6:
+ LoadOpc = RISCV::LD;
+ break;
+ default:
+ llvm_unreachable("Unexpected SEW");
+ }
+ break;
+ }
return nullptr;
case RISCV::SEXT_H:
LoadOpc = RISCV::LH;
diff --git a/llvm/test/CodeGen/RISCV/rvv/stack-folding.ll b/llvm/test/CodeGen/RISCV/rvv/stack-folding.ll
new file mode 100644
index 00000000000000..4771d7fe6ec92b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/stack-folding.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck --check-prefixes=CHECK,RV32 %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck --check-prefixes=CHECK,RV64 %s
+
+define i64 @i64(<vscale x 1 x i64> %v, i1 %c) {
+; RV32-LABEL: i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: andi a0, a0, 1
+; RV32-NEXT: #APP
+; RV32-NEXT: #NO_APP
+; RV32-NEXT: beqz a0, .LBB0_2
+; RV32-NEXT: # %bb.1: # %truebb
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vl1r.v v9, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vsrl.vx v8, v9, a0
+; RV32-NEXT: vmv.x.s a1, v8
+; RV32-NEXT: vmv.x.s a0, v9
+; RV32-NEXT: j .LBB0_3
+; RV32-NEXT: .LBB0_2: # %falsebb
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: .LBB0_3: # %falsebb
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add sp, sp, a2
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: i64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: andi a0, a0, 1
+; RV64-NEXT: #APP
+; RV64-NEXT: #NO_APP
+; RV64-NEXT: beqz a0, .LBB0_2
+; RV64-NEXT: # %bb.1: # %truebb
+; RV64-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: .LBB0_2: # %falsebb
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add sp, sp, a1
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ br i1 %c, label %truebb, label %falsebb
+truebb:
+ %x = extractelement <vscale x 1 x i64> %v, i32 0
+ ret i64 %x
+falsebb:
+ ret i64 0
+}
+
+define i32 @i32(<vscale x 2 x i32> %v, i1 %c) {
+; CHECK-LABEL: i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: beqz a0, .LBB1_2
+; CHECK-NEXT: # %bb.1: # %truebb
+; CHECK-NEXT: lw a0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: .LBB1_2: # %falsebb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ br i1 %c, label %truebb, label %falsebb
+truebb:
+ %x = extractelement <vscale x 2 x i32> %v, i32 0
+ ret i32 %x
+falsebb:
+ ret i32 0
+}
+
+define i16 @i16(<vscale x 4 x i16> %v, i1 %c) {
+; CHECK-LABEL: i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: beqz a0, .LBB2_2
+; CHECK-NEXT: # %bb.1: # %truebb
+; CHECK-NEXT: lh a0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: .LBB2_2: # %falsebb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ br i1 %c, label %truebb, label %falsebb
+truebb:
+ %x = extractelement <vscale x 4 x i16> %v, i32 0
+ ret i16 %x
+falsebb:
+ ret i16 0
+}
+
+define i8 @i8(<vscale x 8 x i8> %v, i1 %c) {
+; CHECK-LABEL: i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: beqz a0, .LBB3_2
+; CHECK-NEXT: # %bb.1: # %truebb
+; CHECK-NEXT: lb a0, 16(sp) # 8-byte Folded Reload
+; CHECK-NEXT: .LBB3_2: # %falsebb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
+ br i1 %c, label %truebb, label %falsebb
+truebb:
+ %x = extractelement <vscale x 8 x i8> %v, i32 0
+ ret i8 %x
+falsebb:
+ ret i8 0
+}
LGTM
LGTM
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
A couple of off-topic observations on this test:
- Why are we increasing stack size by 2 * VLENB when we're only spilling one vector CSE?
- We could have rescheduled the extractelement to avoid the need to spill entirely.
> We could have rescheduled the extractelement to avoid the need to spill entirely.

The "scheduler" is per basic block, so it would need to be something else that is capable of hoisting to another BB.
Why are we increasing stack size by 2 * VLENB when we're only spilling one vector CSE?
I suspect it has something to do with stack alignment being 16? And that code not factoring in VLEN being known >=128.
Yeah, that's why these tests have the branch: to prevent the scheduler from moving the extractelement above the asm. I couldn't use the store volatile trick that I used in remat.ll since it interferes with the reload.
I have no idea why the stack size is 2 * VLENB though.
Confirmed: the 2*VLENB is due to assignRVVStackObjectOffsets not accounting for MinVLen when calculating alignment padding.
A couple of observations on other cases we can fold. Dropping these here not because they relate to the review, but merely because I need some place to find them in the future.
This is the f64/f32 version of llvm#109774. I've left out f16 and bf16 for now because there's a separate issue where we can't select extract_vector_elt when f16/bf16 is a legal type, see llvm#110126.
After #109774, MSVC fails to build LLVM with the error:
```
llvm\lib\Target\RISCV\RISCVInstrInfo.cpp(782): warning C4018: '<': signed/unsigned mismatch
```
The fix is to ensure that the RHS of the comparison is an unsigned integer.
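A minimal sketch of that kind of fix, assuming the warning points at the `STI.getXLen() < (1 << Log2SEW)` comparison added in this patch; the follow-up commit's exact wording may differ:

```cpp
// Sketch only: make the shift produce an unsigned value so the comparison
// against getXLen() (unsigned) no longer mixes signed and unsigned operands.
if (STI.getXLen() < (1U << Log2SEW))
  return nullptr;
```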
If a vector is reloaded from the stack to be used in vmv.x.s, we can tell foldMemoryOperandImpl to fold it into a scalar load.
If XLEN < SEW then this currently just bails. I couldn't think of a way to express a vmv.x.s that truncates in LLVM IR.