[RISCV] Fold vmv.x.s into load from stack #109774

Merged
merged 2 commits, Sep 25, 2024
23 changes: 23 additions & 0 deletions llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -763,6 +763,29 @@ MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
LoadOpc = RISCV::LBU;
break;
}
if (RISCV::getRVVMCOpcode(MI.getOpcode()) == RISCV::VMV_X_S) {
unsigned Log2SEW =
MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm();
if (STI.getXLen() < (1 << Log2SEW))
return nullptr;
switch (Log2SEW) {
case 3:
LoadOpc = RISCV::LB;
break;
case 4:
LoadOpc = RISCV::LH;
break;
case 5:
LoadOpc = RISCV::LW;
break;
case 6:
LoadOpc = RISCV::LD;
break;
default:
llvm_unreachable("Unexpected SEW");
}
break;
}
return nullptr;
case RISCV::SEXT_H:
LoadOpc = RISCV::LH;
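For context, the new case only selects a LoadOpc; the fold itself is carried out by the function's existing common tail, which builds the scalar load against the spill slot's frame index. A rough sketch of that tail, paraphrased from the surrounding function rather than taken from this diff:

```cpp
// Paraphrased sketch of foldMemoryOperandImpl's shared exit path (not part of
// this patch): replace the folded instruction with a scalar load of the chosen
// width from the stack slot the vector register was spilled to.
Register DstReg = MI.getOperand(0).getReg();
return BuildMI(*MI.getParent(), InsertPt, MI.getDebugLoc(), get(LoadOpc),
               DstReg)
    .addFrameIndex(FrameIndex)
    .addImm(0);
```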
162 changes: 162 additions & 0 deletions llvm/test/CodeGen/RISCV/rvv/stack-folding.ll
@@ -0,0 +1,162 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck --check-prefixes=CHECK,RV32 %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck --check-prefixes=CHECK,RV64 %s

define i64 @i64(<vscale x 1 x i64> %v, i1 %c) {
; RV32-LABEL: i64:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 1
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; RV32-NEXT: addi a1, sp, 16
; RV32-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
; RV32-NEXT: andi a0, a0, 1
; RV32-NEXT: #APP
; RV32-NEXT: #NO_APP
; RV32-NEXT: beqz a0, .LBB0_2
; RV32-NEXT: # %bb.1: # %truebb
; RV32-NEXT: li a0, 32
; RV32-NEXT: vl1r.v v9, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT: vsrl.vx v8, v9, a0
; RV32-NEXT: vmv.x.s a1, v8
; RV32-NEXT: vmv.x.s a0, v9
; RV32-NEXT: j .LBB0_3
; RV32-NEXT: .LBB0_2: # %falsebb
; RV32-NEXT: li a1, 0
; RV32-NEXT: .LBB0_3: # %falsebb
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 1
; RV32-NEXT: add sp, sp, a2
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: i64:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: .cfi_def_cfa_offset 16
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 1
; RV64-NEXT: sub sp, sp, a1
; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; RV64-NEXT: addi a1, sp, 16
; RV64-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
; RV64-NEXT: andi a0, a0, 1
; RV64-NEXT: #APP
; RV64-NEXT: #NO_APP
; RV64-NEXT: beqz a0, .LBB0_2
; RV64-NEXT: # %bb.1: # %truebb
; RV64-NEXT: ld a0, 16(sp) # 8-byte Folded Reload
; RV64-NEXT: .LBB0_2: # %falsebb
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 1
; RV64-NEXT: add sp, sp, a1
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: ret
tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
br i1 %c, label %truebb, label %falsebb
truebb:
%x = extractelement <vscale x 1 x i64> %v, i32 0
ret i64 %x
falsebb:
ret i64 0
}

define i32 @i32(<vscale x 2 x i32> %v, i1 %c) {
; CHECK-LABEL: i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: beqz a0, .LBB1_2
; CHECK-NEXT: # %bb.1: # %truebb
; CHECK-NEXT: lw a0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: .LBB1_2: # %falsebb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: add sp, sp, a1
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
br i1 %c, label %truebb, label %falsebb
truebb:
%x = extractelement <vscale x 2 x i32> %v, i32 0
ret i32 %x
falsebb:
ret i32 0
}

define i16 @i16(<vscale x 4 x i16> %v, i1 %c) {
; CHECK-LABEL: i16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
Collaborator

A couple of off-topic observations on this test:

  • Why are we increasing stack size by 2 * VLENB when we're only spilling one vector CSE?
  • We could have rescheduled the extractelement to avoid the need to spill entirely.

@topperc (Collaborator) Sep 24, 2024

> We could have rescheduled the extractelement to avoid the need to spill entirely.

The "scheduler" is per basic block, so it would need to be something else that is capable of hoisting to another BB.

Collaborator

> Why are we increasing stack size by 2 * VLENB when we're only spilling one vector CSE?

I suspect it has something to do with the stack alignment being 16, and that code not factoring in VLEN being known to be >= 128.

Contributor Author

Yeah, that's why these tests have the branch: to prevent the scheduler from moving the extractelement above the asm. I couldn't use the store volatile trick that I used in remat.ll since it interferes with the reload.

I have no idea why the stack size is 2 * VLENB though.

@topperc (Collaborator) Sep 24, 2024

Confirmed: the 2*VLENB is due to assignRVVStackObjectOffsets not accounting for MinVLen when calculating alignment padding.
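A rough sketch of the arithmetic behind that observation, under the assumption (suggested above, not quoted from the pass) that the padding exists to keep the RVV area a multiple of the 16-byte stack alignment; the alignTo helper below just mirrors the usual round-up semantics and is illustrative only:

```cpp
// Hypothetical illustration, not code from assignRVVStackObjectOffsets:
// if the padding calculation only assumes vlenb is a multiple of 8, then keeping
// the RVV area a multiple of the 16-byte stack alignment forces rounding a
// single-slot area up from 1 * vlenb to alignTo(1, 16 / 8) * vlenb = 2 * vlenb.
// With MinVLen = 128 (vlenb >= 16, implied by -mattr=+v / Zvl128b), one vlenb
// would already be 16-byte aligned and the extra padding could be dropped.
#include <cstdint>

constexpr uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

static_assert(alignTo(1, 16 / 8) == 2, "one vlenb slot gets padded to two");
```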

; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: beqz a0, .LBB2_2
; CHECK-NEXT: # %bb.1: # %truebb
; CHECK-NEXT: lh a0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: .LBB2_2: # %falsebb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: add sp, sp, a1
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
br i1 %c, label %truebb, label %falsebb
truebb:
%x = extractelement <vscale x 4 x i16> %v, i32 0
ret i16 %x
falsebb:
ret i16 0
}

define i8 @i8(<vscale x 8 x i8> %v, i1 %c) {
; CHECK-LABEL: i8:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: andi a0, a0, 1
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: beqz a0, .LBB3_2
; CHECK-NEXT: # %bb.1: # %truebb
; CHECK-NEXT: lb a0, 16(sp) # 8-byte Folded Reload
; CHECK-NEXT: .LBB3_2: # %falsebb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 1
; CHECK-NEXT: add sp, sp, a1
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
tail call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
br i1 %c, label %truebb, label %falsebb
truebb:
%x = extractelement <vscale x 8 x i8> %v, i32 0
ret i8 %x
falsebb:
ret i8 0
}