From c45fca82fdbd8507e48e38c310514a03f0f373da Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Sat, 10 May 2025 18:12:58 +0800 Subject: [PATCH 1/5] Precommit tests --- .../rvv/fixed-vectors-deinterleave-load.ll | 164 ++++++ .../rvv/fixed-vectors-interleave-store.ll | 458 +++++++++++++++++ .../RISCV/rvv/vector-deinterleave-load.ll | 120 +++++ .../RISCV/rvv/vector-interleave-store.ll | 339 +++++++++++++ .../RISCV/rvv/vp-vector-interleaved-access.ll | 476 ++++++++++++++++++ 5 files changed, 1557 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index e53dfc23a84bb..df2a333eecd33 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -257,6 +257,49 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p ret {<2 x double>, <2 x double>} %res1 } +define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 1 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: vsetivli zero, 24, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 8 +; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v10, v8, 16 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v9, v10 +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vlseg3e8.v v6, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load <24 x i8>, ptr %p + %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec) + %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0 + %t1 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 1 + %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2 + %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 0 + %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 0 + ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2 +} + define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4: ; CHECK: # %bb.0: @@ -281,6 +324,127 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_fact ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3 } +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor5(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 
0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 24 +; CHECK-NEXT: vslidedown.vi v14, v8, 16 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v13, v8, 8 +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v14, v12, a0 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vslideup.vx v12, v13, a0 +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vmv1r.v v13, v14 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vs4r.v v12, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vlseg5e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load <40 x i8>, ptr %p + %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave5(<40 x i8> %vec) + %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0 + %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1 + %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2 + %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3 + %t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4 + %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1 + %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 + %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 + %res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4 + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4 +} + +define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor7(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor7: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: li a1, 56 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 40 +; CHECK-NEXT: li a2, 32 +; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v16, v8, a1 +; CHECK-NEXT: li a1, 48 +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vslidedown.vx v12, v8, a2 +; CHECK-NEXT: add a2, a0, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v14, v8, 24 +; CHECK-NEXT: vslidedown.vi v18, v8, 16 +; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v13, v8, 8 +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v18, v14, a0 +; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma +; CHECK-NEXT: vslidedown.vx 
v20, v8, a1 +; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v13, a0 +; CHECK-NEXT: vslideup.vx v12, v16, a0 +; CHECK-NEXT: vmv1r.v v9, v18 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vmv1r.v v13, v20 +; CHECK-NEXT: vmv2r.v v10, v12 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vlseg7e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load <56 x i8>, ptr %p + %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave7(<56 x i8> %vec) + %t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0 + %t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1 + %t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2 + %t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3 + %t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4 + %t5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 5 + %t6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 6 + %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1 + %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2 + %res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3 + %res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4 + %res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t5, 5 + %res6 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t6, 6 + ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6 +} + define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8(ptr %ptr) { ; CHECK-LABEL: vector_deinterleave_load_factor8: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index 26c3db6131034..e4dac215b893a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -181,6 +181,138 @@ define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> % ret void } +define void @vector_interleave_store_factor3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, ptr %p) { +; RV32-LABEL: vector_interleave_store_factor3: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 
0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV32-NEXT: vsseg3e32.v v8, (a1) +; RV32-NEXT: vl1re32.v v8, (a1) +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: vl1re32.v v9, (a1) +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: vl1re32.v v10, (a1) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: srli a0, a2, 3 +; RV32-NEXT: li a1, 6 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV32-NEXT: vse32.v v8, (s0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a1, a0, 3 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 32 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_store_factor3: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV64-NEXT: vsseg3e32.v v8, (a1) +; RV64-NEXT: vl1re32.v v8, (a1) +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vl1re32.v v9, (a1) +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vl1re32.v v10, (a1) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; RV64-NEXT: mv s0, a0 +; RV64-NEXT: srli a0, a2, 3 +; RV64-NEXT: li a1, 6 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload +; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-NEXT: vse32.v v8, (s0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a1, a0, 3 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 32 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret +; CHECK-LABEL: vector_interleave_store_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a1) +; 
CHECK-NEXT: vl1re32.v v8, (a1) +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vl1re32.v v10, (a3) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vl1re32.v v12, (a2) +; CHECK-NEXT: vsetivli zero, 12, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %v = call <12 x i32> @llvm.vector.interleave3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) + store <12 x i32> %v, ptr %p + ret void +} + define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor4: ; CHECK: # %bb.0: @@ -194,6 +326,332 @@ define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i3 ret void } +define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, ptr %p) { +; RV32-LABEL: vector_interleave_store_factor5: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a2, a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: add a3, a1, a2 +; RV32-NEXT: add a4, a3, a2 +; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV32-NEXT: vsseg5e32.v v8, (a1) +; RV32-NEXT: vl1re32.v v10, (a4) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v11, (a4) +; RV32-NEXT: vl1re32.v v8, (a1) +; RV32-NEXT: vl1re32.v v9, (a3) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v12, (a4) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: srli a0, a2, 3 +; RV32-NEXT: li a1, 10 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vse32.v v8, (s0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 32 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_store_factor5: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a2, a2, a1 +; 
RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: add a3, a1, a2 +; RV64-NEXT: add a4, a3, a2 +; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV64-NEXT: vsseg5e32.v v8, (a1) +; RV64-NEXT: vl1re32.v v10, (a4) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v11, (a4) +; RV64-NEXT: vl1re32.v v8, (a1) +; RV64-NEXT: vl1re32.v v9, (a3) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v12, (a4) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: mv s0, a0 +; RV64-NEXT: srli a0, a2, 3 +; RV64-NEXT: li a1, 10 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vse32.v v8, (s0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 32 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret +; CHECK-LABEL: vector_interleave_store_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 2 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x05, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 5 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a5, a4, a2 +; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg5e32.v v8, (a1) +; CHECK-NEXT: add a2, a5, a2 +; CHECK-NEXT: vl1re32.v v10, (a5) +; CHECK-NEXT: li a5, 32 +; CHECK-NEXT: vl1re32.v v12, (a4) +; CHECK-NEXT: vl1re32.v v14, (a3) +; CHECK-NEXT: vl1re32.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: vslideup.vi v8, v14, 4 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 8 +; CHECK-NEXT: vl1re32.v v16, (a2) +; CHECK-NEXT: vsetvli zero, a5, e32, m8, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 16 +; CHECK-NEXT: vsetivli zero, 20, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %v = call <20 x i32> @llvm.vector.interleave5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e) + store <20 x i32> %v, ptr %p + ret void +} + +define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, ptr %p) { +; RV32-LABEL: vector_interleave_store_factor7: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 
4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 4 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: add a3, a1, a2 +; RV32-NEXT: add a4, a3, a2 +; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV32-NEXT: vsseg7e32.v v8, (a1) +; RV32-NEXT: vl1re32.v v10, (a4) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v11, (a4) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v8, (a1) +; RV32-NEXT: add a1, a4, a2 +; RV32-NEXT: vl1re32.v v9, (a3) +; RV32-NEXT: vl1re32.v v12, (a4) +; RV32-NEXT: vl1re32.v v13, (a1) +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: vl1re32.v v14, (a1) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: srli a0, a2, 3 +; RV32-NEXT: li a1, 14 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vse32.v v8, (s0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a1, a0, 4 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 32 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_store_factor7: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: add a3, a1, a2 +; RV64-NEXT: add a4, a3, a2 +; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV64-NEXT: vsseg7e32.v v8, (a1) +; RV64-NEXT: vl1re32.v v10, (a4) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v11, (a4) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v8, (a1) +; RV64-NEXT: add a1, a4, a2 +; RV64-NEXT: vl1re32.v v9, (a3) +; RV64-NEXT: vl1re32.v v12, (a4) +; RV64-NEXT: vl1re32.v v13, (a1) +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vl1re32.v v14, (a1) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: mv s0, a0 +; RV64-NEXT: srli a0, a2, 3 +; RV64-NEXT: li a1, 14 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vse32.v v8, (s0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a1, a0, 4 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 32 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded 
Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret +; CHECK-LABEL: vector_interleave_store_factor7: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 3 +; CHECK-NEXT: sub a1, a2, a1 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 7 * vlenb +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: add a2, a1, a3 +; CHECK-NEXT: add a4, a2, a3 +; CHECK-NEXT: add a5, a4, a3 +; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg7e32.v v8, (a1) +; CHECK-NEXT: vl1re32.v v14, (a5) +; CHECK-NEXT: add a5, a5, a3 +; CHECK-NEXT: vl1re32.v v12, (a4) +; CHECK-NEXT: add a4, a5, a3 +; CHECK-NEXT: add a3, a4, a3 +; CHECK-NEXT: vl1re32.v v10, (a4) +; CHECK-NEXT: vl1re32.v v8, (a5) +; CHECK-NEXT: vl1re32.v v16, (a3) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 8 +; CHECK-NEXT: vl1re32.v v16, (a1) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vslideup.vi v12, v14, 4 +; CHECK-NEXT: vl1re32.v v14, (a2) +; CHECK-NEXT: vslideup.vi v16, v14, 4 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; CHECK-NEXT: vslideup.vi v16, v12, 8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vslideup.vi v16, v8, 16 +; CHECK-NEXT: vsetivli zero, 28, e32, m8, ta, ma +; CHECK-NEXT: vse32.v v16, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %v = call <28 x i32> @llvm.vector.interleave7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g) + store <28 x i32> %v, ptr %p + ret void +} + define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor8: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 582aef908964a..be8deb1319c36 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -344,6 +344,42 @@ define {, } @vector_deinterleave_load_nxv2p0 ret {, } %res1 } +define { , , } @vector_deinterleave_load_factor3(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 1 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg3e8.v v6, (a0) +; CHECK-NEXT: 
csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load , ptr %p + %d0 = call {, , } @llvm.vector.deinterleave3( %vec) + %t0 = extractvalue {, , } %d0, 0 + %t1 = extractvalue {, , } %d0, 1 + %t2 = extractvalue {, , } %d0, 2 + %res0 = insertvalue { , , } poison, %t0, 0 + %res1 = insertvalue { , , } %res0, %t1, 0 + %res2 = insertvalue { , , } %res1, %t2, 0 + ret { , , } %res2 +} + define { , , , } @vector_deinterleave_load_factor4(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4: ; CHECK: # %bb.0: @@ -368,6 +404,90 @@ define { , , , , , , } %res3 } +define { , , , , } @vector_deinterleave_load_factor5(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 2 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg5e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load , ptr %p + %d0 = call {, , , , } @llvm.vector.deinterleave5( %vec) + %t0 = extractvalue { , , , , } %d0, 0 + %t1 = extractvalue { , , , , } %d0, 1 + %t2 = extractvalue { , , , , } %d0, 2 + %t3 = extractvalue { , , , , } %d0, 3 + %t4 = extractvalue { , , , , } %d0, 4 + %res0 = insertvalue { , , , , } poison, %t0, 0 + %res1 = insertvalue { , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , } %res3, %t4, 4 + ret { , , , , } %res4 +} + +define { , , , , , , } @vector_deinterleave_load_factor7(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor7: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 3 +; CHECK-NEXT: sub a2, a2, a1 +; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma +; CHECK-NEXT: vle8.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg7e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = load , ptr %p + %d0 = call {, , , , , , } @llvm.vector.deinterleave7( %vec) + %t0 = extractvalue { , , , , , , } %d0, 0 + %t1 = extractvalue { , , , , , , } %d0, 1 + %t2 = extractvalue { , , , , , , } %d0, 2 + %t3 = extractvalue { , , , , , , } %d0, 3 + %t4 = extractvalue { , , , , , , } %d0, 4 + %t5 = extractvalue { , , , , , , } %d0, 5 + %t6 = 
extractvalue { , , , , , , } %d0, 6 + %res0 = insertvalue { , , , , , , } poison, %t0, 0 + %res1 = insertvalue { , , , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , , , } %res3, %t4, 4 + %res5 = insertvalue { , , , , , , } %res3, %t5, 5 + %res6 = insertvalue { , , , , , , } %res3, %t6, 6 + ret { , , , , , , } %res6 +} + define {, , , , , , , } @vector_deinterleave_load_factor8(ptr %ptr) { ; CHECK-LABEL: vector_deinterleave_load_factor8: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index b5eb312bf5e18..eeb0e9e91ed36 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -239,6 +239,107 @@ define void @vector_interleave_store_nxv4p0_nxv2p0( %a, %a, %b, %c, ptr %p) { +; RV32-LABEL: vector_interleave_store_factor3: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV32-NEXT: vsseg3e32.v v8, (a1) +; RV32-NEXT: vl1re32.v v8, (a1) +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: vl1re32.v v9, (a1) +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: vl1re32.v v10, (a1) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: srli a0, a2, 3 +; RV32-NEXT: li a1, 6 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload +; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV32-NEXT: vse32.v v8, (s0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a1, a0, 3 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 32 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_store_factor3: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma +; RV64-NEXT: vsseg3e32.v v8, (a1) +; RV64-NEXT: vl1re32.v v8, (a1) +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vl1re32.v v9, (a1) +; 
RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vl1re32.v v10, (a1) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill +; RV64-NEXT: mv s0, a0 +; RV64-NEXT: srli a0, a2, 3 +; RV64-NEXT: li a1, 6 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload +; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; RV64-NEXT: vse32.v v8, (s0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a1, a0, 3 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 32 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %v = call @llvm.vector.interleave3( %a, %b, %c) + store %v, ptr %p + ret void +} + define void @vector_interleave_store_factor4( %a, %b, %c, %d, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor4: ; CHECK: # %bb.0: @@ -252,6 +353,244 @@ define void @vector_interleave_store_factor4( %a, %a, %b, %c, %d, %e, ptr %p) { +; RV32-LABEL: vector_interleave_store_factor5: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: mv a2, a1 +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a2, a2, a1 +; RV32-NEXT: slli a1, a1, 1 +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: add a3, a1, a2 +; RV32-NEXT: add a4, a3, a2 +; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV32-NEXT: vsseg5e32.v v8, (a1) +; RV32-NEXT: vl1re32.v v10, (a4) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v11, (a4) +; RV32-NEXT: vl1re32.v v8, (a1) +; RV32-NEXT: vl1re32.v v9, (a3) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v12, (a4) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: srli a0, a2, 3 +; RV32-NEXT: li a1, 10 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vse32.v v8, (s0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add a1, a1, a0 +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 32 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_store_factor5: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: mv a2, a1 +; RV64-NEXT: slli a1, a1, 2 +; RV64-NEXT: add a2, 
a2, a1 +; RV64-NEXT: slli a1, a1, 1 +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: add a3, a1, a2 +; RV64-NEXT: add a4, a3, a2 +; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV64-NEXT: vsseg5e32.v v8, (a1) +; RV64-NEXT: vl1re32.v v10, (a4) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v11, (a4) +; RV64-NEXT: vl1re32.v v8, (a1) +; RV64-NEXT: vl1re32.v v9, (a3) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v12, (a4) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: mv s0, a0 +; RV64-NEXT: srli a0, a2, 3 +; RV64-NEXT: li a1, 10 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vse32.v v8, (s0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: add a1, a1, a0 +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 32 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %v = call @llvm.vector.interleave5( %a, %b, %c, %d, %e) + store %v, ptr %p + ret void +} + +define void @vector_interleave_store_factor7( %a, %b, %c, %d, %e, %f, %g, ptr %p) { +; RV32-LABEL: vector_interleave_store_factor7: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: .cfi_offset s0, -8 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 4 +; RV32-NEXT: sub a1, a2, a1 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: add a3, a1, a2 +; RV32-NEXT: add a4, a3, a2 +; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV32-NEXT: vsseg7e32.v v8, (a1) +; RV32-NEXT: vl1re32.v v10, (a4) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v11, (a4) +; RV32-NEXT: add a4, a4, a2 +; RV32-NEXT: vl1re32.v v8, (a1) +; RV32-NEXT: add a1, a4, a2 +; RV32-NEXT: vl1re32.v v9, (a3) +; RV32-NEXT: vl1re32.v v12, (a4) +; RV32-NEXT: vl1re32.v v13, (a1) +; RV32-NEXT: add a1, a1, a2 +; RV32-NEXT: vl1re32.v v14, (a1) +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV32-NEXT: mv s0, a0 +; RV32-NEXT: srli a0, a2, 3 +; RV32-NEXT: li a1, 14 +; RV32-NEXT: call __mulsi3 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV32-NEXT: vse32.v v8, (s0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a1, a0, 4 +; RV32-NEXT: sub a0, a1, a0 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 32 +; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32-NEXT: lw s0, 
24(sp) # 4-byte Folded Reload +; RV32-NEXT: .cfi_restore ra +; RV32-NEXT: .cfi_restore s0 +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vector_interleave_store_factor7: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -32 +; RV64-NEXT: .cfi_def_cfa_offset 32 +; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: .cfi_offset s0, -16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: sub a1, a2, a1 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 3 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: add a3, a1, a2 +; RV64-NEXT: add a4, a3, a2 +; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma +; RV64-NEXT: vsseg7e32.v v8, (a1) +; RV64-NEXT: vl1re32.v v10, (a4) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v11, (a4) +; RV64-NEXT: add a4, a4, a2 +; RV64-NEXT: vl1re32.v v8, (a1) +; RV64-NEXT: add a1, a4, a2 +; RV64-NEXT: vl1re32.v v9, (a3) +; RV64-NEXT: vl1re32.v v12, (a4) +; RV64-NEXT: vl1re32.v v13, (a1) +; RV64-NEXT: add a1, a1, a2 +; RV64-NEXT: vl1re32.v v14, (a1) +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; RV64-NEXT: mv s0, a0 +; RV64-NEXT: srli a0, a2, 3 +; RV64-NEXT: li a1, 14 +; RV64-NEXT: call __muldi3 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; RV64-NEXT: vse32.v v8, (s0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a1, a0, 4 +; RV64-NEXT: sub a0, a1, a0 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 32 +; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: .cfi_restore ra +; RV64-NEXT: .cfi_restore s0 +; RV64-NEXT: addi sp, sp, 32 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %v = call @llvm.vector.interleave7( %a, %b, %c, %d, %e, %f, %g) + store %v, ptr %p + ret void +} + define void @vector_interleave_store_factor8( %a, %b, %c, %d, %e, %f, %g, %h, ptr %p) { ; CHECK-LABEL: vector_interleave_store_factor8: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index d0f35aa8b85e9..97fae479e0cb6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -28,6 +28,68 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % ret { , } %res1 } +define {, , } @load_factor3_v2(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor3_v2: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; RV32-NEXT: slli a2, a1, 1 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs4r.v v8, (a0) +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vlseg3e32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add sp, sp, a0 +; 
RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor3_v2: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 2 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs4r.v v8, (a0) +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vlseg3e32.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 3 + %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) + %deinterleaved.results = call { , , } @llvm.vector.deinterleave3( %wide.masked.load) + %t0 = extractvalue { , , } %deinterleaved.results, 0 + %t1 = extractvalue { , , } %deinterleaved.results, 1 + %t2 = extractvalue { , , } %deinterleaved.results, 2 + %res0 = insertvalue { , , } poison, %t0, 0 + %res1 = insertvalue { , , } %res0, %t1, 1 + %res2 = insertvalue { , , } %res1, %t2, 2 + ret { , , } %res1 +} + define {, , , } @load_factor4_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor4_v2: ; RV32: # %bb.0: @@ -63,6 +125,142 @@ define {, , , , , , } %res3 } +define {, , , , } @load_factor5_v2(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor5_v2: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vlseg5e32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor5_v2: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vlseg5e32.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 5 + %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) + 
%deinterleaved.results = call { , , , , } @llvm.vector.deinterleave5( %wide.masked.load) + %t0 = extractvalue { , , , , } %deinterleaved.results, 0 + %t1 = extractvalue { , , , , } %deinterleaved.results, 1 + %t2 = extractvalue { , , , , } %deinterleaved.results, 2 + %t3 = extractvalue { , , , , } %deinterleaved.results, 3 + %t4 = extractvalue { , , , , } %deinterleaved.results, 4 + %res0 = insertvalue { , , , , } poison, %t0, 0 + %res1 = insertvalue { , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , } %res3, %t4, 4 + ret { , , , , } %res4 +} + +define {, , , , , , } @load_factor7_v2(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor7_v2: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a2, a2, a1 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vlseg7e32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor7_v2: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: subw a2, a2, a1 +; RV64-NEXT: slli a2, a2, 32 +; RV64-NEXT: srli a2, a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: vlseg7e32.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 7 + %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) + %deinterleaved.results = call { , , , , , , } @llvm.vector.deinterleave7( %wide.masked.load) + %t0 = extractvalue { , , , , , , } %deinterleaved.results, 0 + %t1 = extractvalue { , , , , , , } %deinterleaved.results, 1 + %t2 = extractvalue { , , , , , , } %deinterleaved.results, 2 + %t3 = extractvalue { , , , , , , } %deinterleaved.results, 3 + %t4 = extractvalue { , , , , , , } %deinterleaved.results, 4 + %t5 = extractvalue { , , , , , , } %deinterleaved.results, 5 + %t6 = extractvalue { , , , , , , } %deinterleaved.results, 6 + %res0 = insertvalue { , , , , , , } poison, %t0, 0 + %res1 = insertvalue { , , , , , , } %res0, %t1, 1 + %res2 = insertvalue { , , , , , , } %res1, %t2, 2 + %res3 = insertvalue { , , , , , , } %res2, %t3, 3 + %res4 = insertvalue { , , , , , , } %res3, %t4, 4 + %res5 = insertvalue { , , , , , , } %res4, %t5, 5 + %res6 = insertvalue { , , , , , , } %res5, %t6, 6 + ret { , , , , , , } %res6 +} + define {, , , , , , , } @load_factor8_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: 
load_factor8_v2: ; RV32: # %bb.0: @@ -137,6 +335,84 @@ define void @store_factor2_v2( %v0, %v1, pt ret void } +define void @store_factor3_v2( %v0, %v1, %v2, ptr %ptr, i32 %evl) { +; RV32-LABEL: store_factor3_v2: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 1 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: vsetvli a5, zero, e32, mf2, ta, ma +; RV32-NEXT: vsseg3e32.v v8, (a2) +; RV32-NEXT: add a5, a2, a4 +; RV32-NEXT: vle32.v v9, (a5) +; RV32-NEXT: vle32.v v8, (a2) +; RV32-NEXT: srli a3, a3, 3 +; RV32-NEXT: add a2, a3, a3 +; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma +; RV32-NEXT: vslideup.vx v8, v9, a3 +; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v9, (a4) +; RV32-NEXT: slli a2, a1, 1 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor3_v2: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 1 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: srli a4, a3, 1 +; RV64-NEXT: vsetvli a5, zero, e32, mf2, ta, ma +; RV64-NEXT: vsseg3e32.v v8, (a2) +; RV64-NEXT: add a5, a2, a4 +; RV64-NEXT: vle32.v v9, (a5) +; RV64-NEXT: vle32.v v8, (a2) +; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: srli a3, a3, 3 +; RV64-NEXT: add a6, a3, a3 +; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma +; RV64-NEXT: vslideup.vx v8, v9, a3 +; RV64-NEXT: add a4, a5, a4 +; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a4) +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 3 + %interleaved.vec = call @llvm.vector.interleave3( %v0, %v1, %v2) + call void @llvm.vp.store( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) + ret void +} + define void @store_factor4_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor4_v2: ; RV32: # %bb.0: @@ -165,6 +441,206 @@ define void @store_factor4_v2( %v0, %v1, pt ret void } +define void @store_factor5_v2( %v0, %v1, %v2, %v3, %v4, ptr %ptr, i32 %evl) { +; RV32-LABEL: store_factor5_v2: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a3, a2, 1 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: srli a4, 
a3, 1 +; RV32-NEXT: add a5, a2, a4 +; RV32-NEXT: add a6, a5, a4 +; RV32-NEXT: vsetvli a7, zero, e32, mf2, ta, ma +; RV32-NEXT: vsseg5e32.v v8, (a2) +; RV32-NEXT: add a7, a6, a4 +; RV32-NEXT: vle32.v v8, (a7) +; RV32-NEXT: vle32.v v9, (a6) +; RV32-NEXT: srli a3, a3, 3 +; RV32-NEXT: add a6, a3, a3 +; RV32-NEXT: vle32.v v10, (a5) +; RV32-NEXT: vsetvli zero, a6, e32, m1, ta, ma +; RV32-NEXT: vslideup.vx v9, v8, a3 +; RV32-NEXT: vsetvli a5, zero, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v8, (a2) +; RV32-NEXT: vsetvli zero, a6, e32, m1, ta, ma +; RV32-NEXT: vslideup.vx v8, v10, a3 +; RV32-NEXT: add a4, a7, a4 +; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v10, (a4) +; RV32-NEXT: slli a2, a1, 2 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a1, a0, 1 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor5_v2: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 1 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: srli a4, a3, 1 +; RV64-NEXT: add a5, a2, a4 +; RV64-NEXT: add a6, a5, a4 +; RV64-NEXT: vsetvli a7, zero, e32, mf2, ta, ma +; RV64-NEXT: vsseg5e32.v v8, (a2) +; RV64-NEXT: add a7, a6, a4 +; RV64-NEXT: vle32.v v8, (a7) +; RV64-NEXT: vle32.v v9, (a6) +; RV64-NEXT: srli a3, a3, 3 +; RV64-NEXT: add a6, a3, a3 +; RV64-NEXT: vle32.v v10, (a5) +; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma +; RV64-NEXT: vslideup.vx v9, v8, a3 +; RV64-NEXT: vsetvli a5, zero, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v8, (a2) +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma +; RV64-NEXT: vslideup.vx v8, v10, a3 +; RV64-NEXT: add a4, a7, a4 +; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v10, (a4) +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a1, a0, 1 +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 5 + %interleaved.vec = call @llvm.vector.interleave5( %v0, %v1, %v2, %v3, %v4) + call void @llvm.vp.store( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) + ret void +} + +define void @store_factor7_v2( %v0, %v1, %v2, %v3, %v4, %v5, %v6, ptr %ptr, i32 %evl) { +; RV32-LABEL: store_factor7_v2: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: srli a4, a3, 1 +; RV32-NEXT: srli a3, a3, 3 +; RV32-NEXT: add a5, a2, a4 +; RV32-NEXT: add a6, a5, a4 +; RV32-NEXT: add a7, a6, a4 +; RV32-NEXT: add t0, a7, a4 +; RV32-NEXT: vsetvli t1, zero, e32, mf2, ta, ma 
+; RV32-NEXT: vsseg7e32.v v8, (a2) +; RV32-NEXT: add t1, t0, a4 +; RV32-NEXT: vle32.v v8, (t1) +; RV32-NEXT: vle32.v v10, (t0) +; RV32-NEXT: add t0, a3, a3 +; RV32-NEXT: add a4, t1, a4 +; RV32-NEXT: vle32.v v12, (a7) +; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma +; RV32-NEXT: vslideup.vx v10, v8, a3 +; RV32-NEXT: vsetvli a7, zero, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v11, (a4) +; RV32-NEXT: vle32.v v9, (a6) +; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma +; RV32-NEXT: vslideup.vx v9, v12, a3 +; RV32-NEXT: vsetvli a4, zero, e32, mf2, ta, ma +; RV32-NEXT: vle32.v v12, (a5) +; RV32-NEXT: vle32.v v8, (a2) +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a2, a2, a1 +; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma +; RV32-NEXT: vslideup.vx v8, v12, a3 +; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 2 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: store_factor7_v2: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 2 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; RV64-NEXT: addi a2, sp, 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: srli a4, a3, 1 +; RV64-NEXT: srli a3, a3, 3 +; RV64-NEXT: add a5, a2, a4 +; RV64-NEXT: add a6, a5, a4 +; RV64-NEXT: add a7, a6, a4 +; RV64-NEXT: add t0, a7, a4 +; RV64-NEXT: vsetvli t1, zero, e32, mf2, ta, ma +; RV64-NEXT: vsseg7e32.v v8, (a2) +; RV64-NEXT: add t1, t0, a4 +; RV64-NEXT: vle32.v v8, (t1) +; RV64-NEXT: vle32.v v10, (t0) +; RV64-NEXT: add t0, a3, a3 +; RV64-NEXT: add a4, t1, a4 +; RV64-NEXT: vle32.v v12, (a7) +; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma +; RV64-NEXT: vslideup.vx v10, v8, a3 +; RV64-NEXT: vsetvli a7, zero, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v11, (a4) +; RV64-NEXT: vle32.v v9, (a6) +; RV64-NEXT: vle32.v v13, (a5) +; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma +; RV64-NEXT: vslideup.vx v9, v12, a3 +; RV64-NEXT: vsetvli a4, zero, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v8, (a2) +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: subw a2, a2, a1 +; RV64-NEXT: slli a2, a2, 32 +; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma +; RV64-NEXT: vslideup.vx v8, v13, a3 +; RV64-NEXT: srli a2, a2, 32 +; RV64-NEXT: vsetvli zero, a2, e32, m4, ta, ma +; RV64-NEXT: vse32.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 2 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 7 + %interleaved.vec = call @llvm.vector.interleave7( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + call void @llvm.vp.store( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) + ret void +} + define void @store_factor8_v2( %v0, %v1, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor8_v2: ; RV32: # %bb.0: From a6b4b02440dff9d680f4662fae49355fbb69669f Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Sat, 10 May 2025 18:39:14 +0800 Subject: [PATCH 2/5] [IA] Support [de]interleave{3,5,7} --- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 75 ++- .../rvv/fixed-vectors-deinterleave-load.ll | 107 +---- .../rvv/fixed-vectors-interleave-store.ll | 437 +----------------- .../RISCV/rvv/vector-deinterleave-load.ll | 57 --- .../RISCV/rvv/vector-interleave-store.ll | 336 
+------------- .../RISCV/rvv/vp-vector-interleaved-access.ll | 416 ++++------------- 6 files changed, 168 insertions(+), 1260 deletions(-) diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 04d89d61cb6a9..c590e470fa779 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -571,6 +571,25 @@ bool InterleavedAccessImpl::lowerInterleavedStore( return true; } +static unsigned getIntrinsicFactor(const IntrinsicInst *II) { + switch (II->getIntrinsicID()) { + case Intrinsic::vector_deinterleave2: + case Intrinsic::vector_interleave2: + return 2; + case Intrinsic::vector_deinterleave3: + case Intrinsic::vector_interleave3: + return 3; + case Intrinsic::vector_deinterleave5: + case Intrinsic::vector_interleave5: + return 5; + case Intrinsic::vector_deinterleave7: + case Intrinsic::vector_interleave7: + return 7; + default: + llvm_unreachable("Unexpected intrinsic"); + } +} + // For an (de)interleave tree like this: // // A C B D @@ -586,7 +605,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore( // to reorder them by interleaving these values. static void interleaveLeafValues(MutableArrayRef SubLeaves) { unsigned NumLeaves = SubLeaves.size(); - if (NumLeaves == 2) + if (NumLeaves == 2 || !isPowerOf2_64(NumLeaves)) return; assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1); @@ -608,7 +627,10 @@ static void interleaveLeafValues(MutableArrayRef SubLeaves) { static bool getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_interleave2); + assert(II->getIntrinsicID() == Intrinsic::vector_interleave2 || + II->getIntrinsicID() == Intrinsic::vector_interleave3 || + II->getIntrinsicID() == Intrinsic::vector_interleave5 || + II->getIntrinsicID() == Intrinsic::vector_interleave7); // Visit with BFS SmallVector Queue; @@ -620,7 +642,7 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, // All the intermediate intrinsics will be deleted. DeadInsts.push_back(Current); - for (unsigned I = 0; I < 2; ++I) { + for (unsigned I = 0; I < getIntrinsicFactor(Current); ++I) { Value *Op = Current->getOperand(I); if (auto *OpII = dyn_cast(Op)) if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) { @@ -638,9 +660,10 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Operands, } const unsigned Factor = Operands.size(); - // Currently we only recognize power-of-two factors. + // Currently we only recognize factors of 2, 3, 5 and 7. // FIXME: should we assert here instead? 
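The check below accepts two shapes of interleave tree. A minimal sketch in IR, with illustrative scalable types and value names (only the intrinsic names come from the code above):

  ; factor 4, recognized as a balanced tree of interleave2 calls (the power-of-two path)
  %lo = call <vscale x 4 x i32> @llvm.vector.interleave2(<vscale x 2 x i32> %a, <vscale x 2 x i32> %c)
  %hi = call <vscale x 4 x i32> @llvm.vector.interleave2(<vscale x 2 x i32> %b, <vscale x 2 x i32> %d)
  %v4 = call <vscale x 8 x i32> @llvm.vector.interleave2(<vscale x 4 x i32> %lo, <vscale x 4 x i32> %hi)

  ; factor 3, recognized because Operands.size() matches getIntrinsicFactor(II) of the root interleave3
  %v3 = call <vscale x 6 x i32> @llvm.vector.interleave3(<vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c)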
- if (Factor <= 1 || !isPowerOf2_32(Factor)) + if (Factor <= 1 || + (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) return false; interleaveLeafValues(Operands); @@ -651,9 +674,12 @@ static bool getVectorDeinterleaveFactor(IntrinsicInst *II, SmallVectorImpl &Results, SmallVectorImpl &DeadInsts) { - assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2); + assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2 || + II->getIntrinsicID() == Intrinsic::vector_deinterleave3 || + II->getIntrinsicID() == Intrinsic::vector_deinterleave5 || + II->getIntrinsicID() == Intrinsic::vector_deinterleave7); using namespace PatternMatch; - if (!II->hasNUses(2)) + if (!II->hasNUses(getIntrinsicFactor(II))) return false; // Visit with BFS @@ -662,12 +688,12 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, while (!Queue.empty()) { IntrinsicInst *Current = Queue.front(); Queue.erase(Queue.begin()); - assert(Current->hasNUses(2)); + assert(Current->hasNUses(getIntrinsicFactor(Current))); // All the intermediate intrinsics will be deleted from the bottom-up. DeadInsts.insert(DeadInsts.begin(), Current); - ExtractValueInst *LHS = nullptr, *RHS = nullptr; + SmallVector EVs(getIntrinsicFactor(Current), nullptr); for (User *Usr : Current->users()) { if (!isa(Usr)) return 0; @@ -679,17 +705,15 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, if (Indices.size() != 1) return false; - if (Indices[0] == 0 && !LHS) - LHS = EV; - else if (Indices[0] == 1 && !RHS) - RHS = EV; + if (!EVs[Indices[0]]) + EVs[Indices[0]] = EV; else return false; } // We have legal indices. At this point we're either going // to continue the traversal or push the leaf values into Results. - for (ExtractValueInst *EV : {LHS, RHS}) { + for (ExtractValueInst *EV : EVs) { // Continue the traversal. We're playing safe here and matching only the // expression consisting of a perfectly balanced binary tree in which all // intermediate values are only used once. @@ -713,9 +737,10 @@ getVectorDeinterleaveFactor(IntrinsicInst *II, } const unsigned Factor = Results.size(); - // Currently we only recognize power-of-two factors. + // Currently we only recognize factors of 2, 3, 5 and 7. // FIXME: should we assert here instead? - if (Factor <= 1 || !isPowerOf2_32(Factor)) + if (Factor <= 1 || + (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))) return 0; interleaveLeafValues(Results); @@ -878,11 +903,23 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) { if (auto *II = dyn_cast(&I)) { // At present, we only have intrinsics to represent (de)interleaving - // with a factor of 2. - if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2) + // with a factor of 2,3,5 and 7. 
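For the deinterleave intrinsics dispatched below, getVectorDeinterleaveFactor additionally requires every result index to be consumed by exactly one extractvalue. A sketch of a factor-3 use it rejects (types and value names are illustrative):

  %d  = call {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave3(<vscale x 6 x i32> %wide)
  %a0 = extractvalue {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} %d, 0
  %b0 = extractvalue {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} %d, 0 ; index 0 a second time: EVs[0] is already set, so the walk returns false
  %a2 = extractvalue {<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32>} %d, 2

Such a use is left to the generic legalization path instead of being turned into a segment load.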
+ switch (II->getIntrinsicID()) { + case Intrinsic::vector_deinterleave2: + case Intrinsic::vector_deinterleave3: + case Intrinsic::vector_deinterleave5: + case Intrinsic::vector_deinterleave7: Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts); - else if (II->getIntrinsicID() == Intrinsic::vector_interleave2) + break; + case Intrinsic::vector_interleave2: + case Intrinsic::vector_interleave3: + case Intrinsic::vector_interleave5: + case Intrinsic::vector_interleave7: Changed |= lowerInterleaveIntrinsic(II, DeadInsts); + break; + default: + break; + } } } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index df2a333eecd33..31529b1783651 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -260,34 +260,8 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor3: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; CHECK-NEXT: vsetivli zero, 24, e8, m2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v8, 8 -; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v8, 16 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v12, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v9, v10 -; CHECK-NEXT: vs2r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vlseg3e8.v v6, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load <24 x i8>, ptr %p %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec) @@ -327,42 +301,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_fact define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor5(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor5: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: li a1, 40 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v8, 24 -; CHECK-NEXT: vslidedown.vi v14, v8, 16 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v13, v8, 8 -; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v14, v12, a0 -; CHECK-NEXT: vmv1r.v v12, v8 -; CHECK-NEXT: vslideup.vx v12, 
v13, a0 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-NEXT: vmv1r.v v13, v14 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv2r.v v14, v8 -; CHECK-NEXT: vs4r.v v12, (a0) -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vlseg5e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load <40 x i8>, ptr %p %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave5(<40 x i8> %vec) @@ -382,49 +322,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor7(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor7: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: li a1, 56 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 40 -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v16, v8, a1 -; CHECK-NEXT: li a1, 48 -; CHECK-NEXT: srli a0, a0, 1 -; CHECK-NEXT: vslidedown.vx v12, v8, a2 -; CHECK-NEXT: add a2, a0, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v14, v8, 24 -; CHECK-NEXT: vslidedown.vi v18, v8, 16 -; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v13, v8, 8 -; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v18, v14, a0 -; CHECK-NEXT: vsetivli zero, 8, e8, m4, ta, ma -; CHECK-NEXT: vslidedown.vx v20, v8, a1 -; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v13, a0 -; CHECK-NEXT: vslideup.vx v12, v16, a0 -; CHECK-NEXT: vmv1r.v v9, v18 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vmv1r.v v13, v20 -; CHECK-NEXT: vmv2r.v v10, v12 -; CHECK-NEXT: vs4r.v v8, (a0) -; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: vlseg7e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load <56 x i8>, ptr %p %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave7(<56 x i8> %vec) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll index e4dac215b893a..8244db45a7ef2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll @@ -182,131 +182,10 @@ define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> % } define void @vector_interleave_store_factor3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, ptr %p) { -; RV32-LABEL: vector_interleave_store_factor3: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; 
RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a1, a2, a1 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV32-NEXT: vsseg3e32.v v8, (a1) -; RV32-NEXT: vl1re32.v v8, (a1) -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: vl1re32.v v9, (a1) -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: vl1re32.v v10, (a1) -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV32-NEXT: mv s0, a0 -; RV32-NEXT: srli a0, a2, 3 -; RV32-NEXT: li a1, 6 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload -; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (s0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a1, a0, 3 -; RV32-NEXT: sub a0, a1, a0 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 32 -; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 32 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: vector_interleave_store_factor3: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -32 -; RV64-NEXT: .cfi_def_cfa_offset 32 -; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: sub a1, a2, a1 -; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV64-NEXT: vsseg3e32.v v8, (a1) -; RV64-NEXT: vl1re32.v v8, (a1) -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vl1re32.v v9, (a1) -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vl1re32.v v10, (a1) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: mv s0, a0 -; RV64-NEXT: srli a0, a2, 3 -; RV64-NEXT: li a1, 6 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload -; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV64-NEXT: vse32.v v8, (s0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 3 -; RV64-NEXT: sub a0, a1, a0 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 32 -; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: addi sp, sp, 32 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret ; CHECK-LABEL: vector_interleave_store_factor3: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 1 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: 
.cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: add a3, a1, a2 -; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg3e32.v v8, (a1) -; CHECK-NEXT: vl1re32.v v8, (a1) -; CHECK-NEXT: add a2, a3, a2 -; CHECK-NEXT: vl1re32.v v10, (a3) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vl1re32.v v12, (a2) -; CHECK-NEXT: vsetivli zero, 12, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 1 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) ; CHECK-NEXT: ret %v = call <12 x i32> @llvm.vector.interleave3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) store <12 x i32> %v, ptr %p @@ -327,160 +206,10 @@ define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i3 } define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, ptr %p) { -; RV32-LABEL: vector_interleave_store_factor5: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a2, a2, a1 -; RV32-NEXT: slli a1, a1, 1 -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: add a3, a1, a2 -; RV32-NEXT: add a4, a3, a2 -; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV32-NEXT: vsseg5e32.v v8, (a1) -; RV32-NEXT: vl1re32.v v10, (a4) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v11, (a4) -; RV32-NEXT: vl1re32.v v8, (a1) -; RV32-NEXT: vl1re32.v v9, (a3) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v12, (a4) -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: mv s0, a0 -; RV32-NEXT: srli a0, a2, 3 -; RV32-NEXT: li a1, 10 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vse32.v v8, (s0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: slli a0, a0, 2 -; RV32-NEXT: add a1, a1, a0 -; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 32 -; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 32 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: vector_interleave_store_factor5: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -32 -; RV64-NEXT: .cfi_def_cfa_offset 32 -; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; 
RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: mv a2, a1 -; RV64-NEXT: slli a1, a1, 2 -; RV64-NEXT: add a2, a2, a1 -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: add a3, a1, a2 -; RV64-NEXT: add a4, a3, a2 -; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV64-NEXT: vsseg5e32.v v8, (a1) -; RV64-NEXT: vl1re32.v v10, (a4) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v11, (a4) -; RV64-NEXT: vl1re32.v v8, (a1) -; RV64-NEXT: vl1re32.v v9, (a3) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v12, (a4) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV64-NEXT: mv s0, a0 -; RV64-NEXT: srli a0, a2, 3 -; RV64-NEXT: li a1, 10 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vse32.v v8, (s0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: slli a0, a0, 2 -; RV64-NEXT: add a1, a1, a0 -; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 32 -; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: addi sp, sp, 32 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret ; CHECK-LABEL: vector_interleave_store_factor5: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 2 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x05, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 5 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: add a3, a1, a2 -; CHECK-NEXT: add a4, a3, a2 -; CHECK-NEXT: add a5, a4, a2 -; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg5e32.v v8, (a1) -; CHECK-NEXT: add a2, a5, a2 -; CHECK-NEXT: vl1re32.v v10, (a5) -; CHECK-NEXT: li a5, 32 -; CHECK-NEXT: vl1re32.v v12, (a4) -; CHECK-NEXT: vl1re32.v v14, (a3) -; CHECK-NEXT: vl1re32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v10, 4 -; CHECK-NEXT: vslideup.vi v8, v14, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: vl1re32.v v16, (a2) -; CHECK-NEXT: vsetvli zero, a5, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 16 -; CHECK-NEXT: vsetivli zero, 20, e32, m8, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 2 -; CHECK-NEXT: add a0, a1, a0 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg5e32.v v8, (a0) ; CHECK-NEXT: ret %v = call <20 x i32> @llvm.vector.interleave5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e) store <20 x i32> %v, ptr %p @@ -488,164 +217,10 @@ define void @vector_interleave_store_factor5(<4 x i32> %a, 
<4 x i32> %b, <4 x i3 } define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, ptr %p) { -; RV32-LABEL: vector_interleave_store_factor7: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 4 -; RV32-NEXT: sub a1, a2, a1 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: add a3, a1, a2 -; RV32-NEXT: add a4, a3, a2 -; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV32-NEXT: vsseg7e32.v v8, (a1) -; RV32-NEXT: vl1re32.v v10, (a4) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v11, (a4) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v8, (a1) -; RV32-NEXT: add a1, a4, a2 -; RV32-NEXT: vl1re32.v v9, (a3) -; RV32-NEXT: vl1re32.v v12, (a4) -; RV32-NEXT: vl1re32.v v13, (a1) -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: vl1re32.v v14, (a1) -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: mv s0, a0 -; RV32-NEXT: srli a0, a2, 3 -; RV32-NEXT: li a1, 14 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vse32.v v8, (s0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a1, a0, 4 -; RV32-NEXT: sub a0, a1, a0 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 32 -; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 32 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: vector_interleave_store_factor7: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -32 -; RV64-NEXT: .cfi_def_cfa_offset 32 -; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 4 -; RV64-NEXT: sub a1, a2, a1 -; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: add a3, a1, a2 -; RV64-NEXT: add a4, a3, a2 -; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV64-NEXT: vsseg7e32.v v8, (a1) -; RV64-NEXT: vl1re32.v v10, (a4) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v11, (a4) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v8, (a1) -; RV64-NEXT: add a1, a4, a2 -; RV64-NEXT: vl1re32.v v9, (a3) -; RV64-NEXT: vl1re32.v v12, (a4) -; RV64-NEXT: vl1re32.v v13, (a1) -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vl1re32.v v14, (a1) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV64-NEXT: mv s0, a0 -; RV64-NEXT: srli a0, a2, 3 -; RV64-NEXT: li a1, 14 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: addi a1, 
sp, 16 -; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vse32.v v8, (s0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 4 -; RV64-NEXT: sub a0, a1, a0 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 32 -; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: addi sp, sp, 32 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret ; CHECK-LABEL: vector_interleave_store_factor7: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 3 -; CHECK-NEXT: sub a1, a2, a1 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 7 * vlenb -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: add a2, a1, a3 -; CHECK-NEXT: add a4, a2, a3 -; CHECK-NEXT: add a5, a4, a3 -; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma -; CHECK-NEXT: vsseg7e32.v v8, (a1) -; CHECK-NEXT: vl1re32.v v14, (a5) -; CHECK-NEXT: add a5, a5, a3 -; CHECK-NEXT: vl1re32.v v12, (a4) -; CHECK-NEXT: add a4, a5, a3 -; CHECK-NEXT: add a3, a4, a3 -; CHECK-NEXT: vl1re32.v v10, (a4) -; CHECK-NEXT: vl1re32.v v8, (a5) -; CHECK-NEXT: vl1re32.v v16, (a3) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v8, v10, 4 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 8 -; CHECK-NEXT: vl1re32.v v16, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v12, v14, 4 -; CHECK-NEXT: vl1re32.v v14, (a2) -; CHECK-NEXT: vslideup.vi v16, v14, 4 -; CHECK-NEXT: li a1, 32 -; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; CHECK-NEXT: vslideup.vi v16, v12, 8 -; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v16, v8, 16 -; CHECK-NEXT: vsetivli zero, 28, e32, m8, ta, ma -; CHECK-NEXT: vse32.v v16, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a1, a0, 3 -; CHECK-NEXT: sub a0, a1, a0 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg7e32.v v8, (a0) ; CHECK-NEXT: ret %v = call <28 x i32> @llvm.vector.interleave7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g) store <28 x i32> %v, ptr %p diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index be8deb1319c36..0483bbbd35b39 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -347,27 +347,8 @@ define {, } @vector_deinterleave_load_nxv2p0 define { , , } @vector_deinterleave_load_factor3(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor3: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 1 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; 
CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs4r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg3e8.v v6, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 2 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p %d0 = call {, , } @llvm.vector.deinterleave3( %vec) @@ -407,27 +388,8 @@ define { , , , , , , , } @vector_deinterleave_load_factor5(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor5: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 2 -; CHECK-NEXT: add a1, a2, a1 -; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg5e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p %d0 = call {, , , , } @llvm.vector.deinterleave5( %vec) @@ -447,27 +409,8 @@ define { , , , , , , , , , } @vector_deinterleave_load_factor7(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor7: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a2, a1, 3 -; CHECK-NEXT: sub a2, a2, a1 -; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma ; CHECK-NEXT: vlseg7e8.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret %vec = load , ptr %p %d0 = call {, , , , , , } @llvm.vector.deinterleave7( %vec) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index eeb0e9e91ed36..4332ca411d91b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -240,101 +240,11 @@ define void @vector_interleave_store_nxv4p0_nxv2p0( %a, %a, %b, %c, ptr %p) { -; RV32-LABEL: vector_interleave_store_factor3: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a1, a2, a1 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; 
RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV32-NEXT: vsseg3e32.v v8, (a1) -; RV32-NEXT: vl1re32.v v8, (a1) -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: vl1re32.v v9, (a1) -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: vl1re32.v v10, (a1) -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV32-NEXT: mv s0, a0 -; RV32-NEXT: srli a0, a2, 3 -; RV32-NEXT: li a1, 6 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload -; RV32-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (s0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a1, a0, 3 -; RV32-NEXT: sub a0, a1, a0 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 32 -; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 32 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: vector_interleave_store_factor3: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -32 -; RV64-NEXT: .cfi_def_cfa_offset 32 -; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: sub a1, a2, a1 -; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x07, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 7 * vlenb -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: vsetvli a3, zero, e32, m1, ta, ma -; RV64-NEXT: vsseg3e32.v v8, (a1) -; RV64-NEXT: vl1re32.v v8, (a1) -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vl1re32.v v9, (a1) -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vl1re32.v v10, (a1) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs4r.v v8, (a1) # vscale x 32-byte Folded Spill -; RV64-NEXT: mv s0, a0 -; RV64-NEXT: srli a0, a2, 3 -; RV64-NEXT: li a1, 6 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl4r.v v8, (a1) # vscale x 32-byte Folded Reload -; RV64-NEXT: vsetvli zero, a0, e32, m4, ta, ma -; RV64-NEXT: vse32.v v8, (s0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 3 -; RV64-NEXT: sub a0, a1, a0 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 32 -; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: addi sp, sp, 32 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret +; CHECK-LABEL: vector_interleave_store_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: ret %v = call @llvm.vector.interleave3( %a, %b, %c) store %v, ptr %p ret void @@ -354,238 +264,22 @@ define void @vector_interleave_store_factor4( %a, %a, %b, %c, %d, %e, ptr %p) { -; RV32-LABEL: vector_interleave_store_factor5: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: mv a2, a1 -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a2, a2, 
a1 -; RV32-NEXT: slli a1, a1, 1 -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: add a3, a1, a2 -; RV32-NEXT: add a4, a3, a2 -; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV32-NEXT: vsseg5e32.v v8, (a1) -; RV32-NEXT: vl1re32.v v10, (a4) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v11, (a4) -; RV32-NEXT: vl1re32.v v8, (a1) -; RV32-NEXT: vl1re32.v v9, (a3) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v12, (a4) -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: mv s0, a0 -; RV32-NEXT: srli a0, a2, 3 -; RV32-NEXT: li a1, 10 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vse32.v v8, (s0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: mv a1, a0 -; RV32-NEXT: slli a0, a0, 2 -; RV32-NEXT: add a1, a1, a0 -; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: add a0, a0, a1 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 32 -; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 32 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: vector_interleave_store_factor5: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -32 -; RV64-NEXT: .cfi_def_cfa_offset 32 -; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: mv a2, a1 -; RV64-NEXT: slli a1, a1, 2 -; RV64-NEXT: add a2, a2, a1 -; RV64-NEXT: slli a1, a1, 1 -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0d, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 13 * vlenb -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: add a3, a1, a2 -; RV64-NEXT: add a4, a3, a2 -; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV64-NEXT: vsseg5e32.v v8, (a1) -; RV64-NEXT: vl1re32.v v10, (a4) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v11, (a4) -; RV64-NEXT: vl1re32.v v8, (a1) -; RV64-NEXT: vl1re32.v v9, (a3) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v12, (a4) -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV64-NEXT: mv s0, a0 -; RV64-NEXT: srli a0, a2, 3 -; RV64-NEXT: li a1, 10 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vse32.v v8, (s0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: mv a1, a0 -; RV64-NEXT: slli a0, a0, 2 -; RV64-NEXT: add a1, a1, a0 -; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: add a0, a0, a1 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 32 -; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: addi sp, sp, 32 -; RV64-NEXT: 
.cfi_def_cfa_offset 0 -; RV64-NEXT: ret +; CHECK-LABEL: vector_interleave_store_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg5e32.v v8, (a0) +; CHECK-NEXT: ret %v = call @llvm.vector.interleave5( %a, %b, %c, %d, %e) store %v, ptr %p ret void } define void @vector_interleave_store_factor7( %a, %b, %c, %d, %e, %f, %g, ptr %p) { -; RV32-LABEL: vector_interleave_store_factor7: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -32 -; RV32-NEXT: .cfi_def_cfa_offset 32 -; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill -; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 -; RV32-NEXT: .cfi_offset s0, -8 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 4 -; RV32-NEXT: sub a1, a2, a1 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: add a3, a1, a2 -; RV32-NEXT: add a4, a3, a2 -; RV32-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV32-NEXT: vsseg7e32.v v8, (a1) -; RV32-NEXT: vl1re32.v v10, (a4) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v11, (a4) -; RV32-NEXT: add a4, a4, a2 -; RV32-NEXT: vl1re32.v v8, (a1) -; RV32-NEXT: add a1, a4, a2 -; RV32-NEXT: vl1re32.v v9, (a3) -; RV32-NEXT: vl1re32.v v12, (a4) -; RV32-NEXT: vl1re32.v v13, (a1) -; RV32-NEXT: add a1, a1, a2 -; RV32-NEXT: vl1re32.v v14, (a1) -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV32-NEXT: mv s0, a0 -; RV32-NEXT: srli a0, a2, 3 -; RV32-NEXT: li a1, 14 -; RV32-NEXT: call __mulsi3 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV32-NEXT: vse32.v v8, (s0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a1, a0, 4 -; RV32-NEXT: sub a0, a1, a0 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 32 -; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload -; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: .cfi_restore s0 -; RV32-NEXT: addi sp, sp, 32 -; RV32-NEXT: .cfi_def_cfa_offset 0 -; RV32-NEXT: ret -; -; RV64-LABEL: vector_interleave_store_factor7: -; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -32 -; RV64-NEXT: .cfi_def_cfa_offset 32 -; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: .cfi_offset ra, -8 -; RV64-NEXT: .cfi_offset s0, -16 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 4 -; RV64-NEXT: sub a1, a2, a1 -; RV64-NEXT: sub sp, sp, a1 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x0f, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 15 * vlenb -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a1, a1, 3 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: add a3, a1, a2 -; RV64-NEXT: add a4, a3, a2 -; RV64-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; RV64-NEXT: vsseg7e32.v v8, (a1) -; RV64-NEXT: vl1re32.v v10, (a4) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v11, (a4) -; RV64-NEXT: add a4, a4, a2 -; RV64-NEXT: vl1re32.v v8, (a1) -; RV64-NEXT: add a1, a4, a2 -; RV64-NEXT: vl1re32.v v9, (a3) -; RV64-NEXT: vl1re32.v v12, (a4) -; RV64-NEXT: vl1re32.v v13, (a1) -; RV64-NEXT: add a1, a1, a2 -; RV64-NEXT: vl1re32.v v14, (a1) -; RV64-NEXT: addi 
a1, sp, 16 -; RV64-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill -; RV64-NEXT: mv s0, a0 -; RV64-NEXT: srli a0, a2, 3 -; RV64-NEXT: li a1, 14 -; RV64-NEXT: call __muldi3 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload -; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; RV64-NEXT: vse32.v v8, (s0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 4 -; RV64-NEXT: sub a0, a1, a0 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 32 -; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: .cfi_restore ra -; RV64-NEXT: .cfi_restore s0 -; RV64-NEXT: addi sp, sp, 32 -; RV64-NEXT: .cfi_def_cfa_offset 0 -; RV64-NEXT: ret +; CHECK-LABEL: vector_interleave_store_factor7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vsseg7e32.v v8, (a0) +; CHECK-NEXT: ret %v = call @llvm.vector.interleave7( %a, %b, %c, %d, %e, %f, %g) store %v, ptr %p ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 97fae479e0cb6..142ee5256f9e7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -31,52 +31,28 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % define {, , } @load_factor3_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor3_v2: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 2 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; RV32-NEXT: slli a2, a1, 1 ; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs4r.v v8, (a0) -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: lui a2, 699051 +; RV32-NEXT: addi a2, a2, -1365 +; RV32-NEXT: mulhu a1, a1, a2 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg3e32.v v8, (a0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 2 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: load_factor3_v2: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 2 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb ; RV64-NEXT: slli a2, a1, 1 ; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: lui a2, 699051 +; RV64-NEXT: addi a2, a2, -1365 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs4r.v v8, (a0) -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: slli a2, a2, 32 +; RV64-NEXT: mulhu a1, a1, a2 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg3e32.v v8, (a0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 2 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 16 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %rvl = mul i32 %evl, 3 %wide.masked.load = 
call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) @@ -128,52 +104,28 @@ define {, , , , , , , } @load_factor5_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor5_v2: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV32-NEXT: slli a2, a1, 2 ; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: lui a2, 838861 +; RV32-NEXT: addi a2, a2, -819 +; RV32-NEXT: mulhu a1, a1, a2 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg5e32.v v8, (a0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: load_factor5_v2: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: slli a2, a1, 2 ; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: lui a2, 838861 +; RV64-NEXT: addi a2, a2, -819 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: slli a2, a2, 32 +; RV64-NEXT: mulhu a1, a1, a2 +; RV64-NEXT: srli a1, a1, 34 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg5e32.v v8, (a0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 16 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %rvl = mul i32 %evl, 5 %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) @@ -194,52 +146,35 @@ define {, , , , , , , , , } @load_factor7_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor7_v2: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV32-NEXT: slli a2, a1, 3 ; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a0) -; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vs8r.v v8, (a0) -; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: lui a1, 149797 +; RV32-NEXT: addi a1, a1, -1755 +; RV32-NEXT: mulhu a1, a2, a1 +; RV32-NEXT: sub a2, a2, a1 +; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg7e32.v v8, (a0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: load_factor7_v2: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, 
sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 3 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: lui a3, 149797 ; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: srli a2, a2, 32 -; RV64-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: addi a0, sp, 16 -; RV64-NEXT: vs8r.v v8, (a0) -; RV64-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV64-NEXT: addi a1, a3, -1755 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: mulhu a1, a3, a1 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: subw a2, a2, a1 +; RV64-NEXT: srliw a2, a2, 1 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg7e32.v v8, (a0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 3 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 16 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: .cfi_def_cfa_offset 0 ; RV64-NEXT: ret %rvl = mul i32 %evl, 7 %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) @@ -338,74 +273,28 @@ define void @store_factor2_v2( %v0, %v1, pt define void @store_factor3_v2( %v0, %v1, %v2, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor3_v2: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 1 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: srli a4, a3, 1 -; RV32-NEXT: vsetvli a5, zero, e32, mf2, ta, ma -; RV32-NEXT: vsseg3e32.v v8, (a2) -; RV32-NEXT: add a5, a2, a4 -; RV32-NEXT: vle32.v v9, (a5) -; RV32-NEXT: vle32.v v8, (a2) -; RV32-NEXT: srli a3, a3, 3 -; RV32-NEXT: add a2, a3, a3 -; RV32-NEXT: vsetvli zero, a2, e32, m1, ta, ma -; RV32-NEXT: vslideup.vx v8, v9, a3 -; RV32-NEXT: add a4, a5, a4 -; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v9, (a4) ; RV32-NEXT: slli a2, a1, 1 ; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: lui a2, 699051 +; RV32-NEXT: addi a2, a2, -1365 +; RV32-NEXT: mulhu a1, a1, a2 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsseg3e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_factor3_v2: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 1 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: srli a4, a3, 1 -; RV64-NEXT: vsetvli a5, zero, e32, mf2, ta, ma -; RV64-NEXT: vsseg3e32.v v8, (a2) -; RV64-NEXT: add a5, a2, a4 -; RV64-NEXT: vle32.v v9, (a5) -; RV64-NEXT: vle32.v v8, (a2) ; RV64-NEXT: slli a2, a1, 1 -; RV64-NEXT: srli a3, a3, 3 -; RV64-NEXT: add a6, a3, a3 -; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; RV64-NEXT: 
vslideup.vx v8, v9, a3 -; RV64-NEXT: add a4, a5, a4 -; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v9, (a4) ; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: lui a2, 699051 +; RV64-NEXT: addi a2, a2, -1365 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 16 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: slli a2, a2, 32 +; RV64-NEXT: mulhu a1, a1, a2 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsseg3e32.v v8, (a0) ; RV64-NEXT: ret %rvl = mul i32 %evl, 3 %interleaved.vec = call @llvm.vector.interleave3( %v0, %v1, %v2) @@ -444,92 +333,28 @@ define void @store_factor4_v2( %v0, %v1, pt define void @store_factor5_v2( %v0, %v1, %v2, %v3, %v4, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor5_v2: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 1 -; RV32-NEXT: add a2, a3, a2 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: srli a4, a3, 1 -; RV32-NEXT: add a5, a2, a4 -; RV32-NEXT: add a6, a5, a4 -; RV32-NEXT: vsetvli a7, zero, e32, mf2, ta, ma -; RV32-NEXT: vsseg5e32.v v8, (a2) -; RV32-NEXT: add a7, a6, a4 -; RV32-NEXT: vle32.v v8, (a7) -; RV32-NEXT: vle32.v v9, (a6) -; RV32-NEXT: srli a3, a3, 3 -; RV32-NEXT: add a6, a3, a3 -; RV32-NEXT: vle32.v v10, (a5) -; RV32-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; RV32-NEXT: vslideup.vx v9, v8, a3 -; RV32-NEXT: vsetvli a5, zero, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v8, (a2) -; RV32-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; RV32-NEXT: vslideup.vx v8, v10, a3 -; RV32-NEXT: add a4, a7, a4 -; RV32-NEXT: vsetvli a2, zero, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v10, (a4) ; RV32-NEXT: slli a2, a1, 2 ; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a1, a0, 1 -; RV32-NEXT: add a0, a1, a0 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: lui a2, 838861 +; RV32-NEXT: addi a2, a2, -819 +; RV32-NEXT: mulhu a1, a1, a2 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsseg5e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_factor5_v2: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 1 -; RV64-NEXT: add a2, a3, a2 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 3 * vlenb -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: srli a4, a3, 1 -; RV64-NEXT: add a5, a2, a4 -; RV64-NEXT: add a6, a5, a4 -; RV64-NEXT: vsetvli a7, zero, e32, mf2, ta, ma -; RV64-NEXT: vsseg5e32.v v8, (a2) -; RV64-NEXT: add a7, a6, a4 -; RV64-NEXT: vle32.v v8, (a7) -; RV64-NEXT: vle32.v v9, (a6) -; RV64-NEXT: srli a3, a3, 3 -; RV64-NEXT: add a6, a3, a3 -; RV64-NEXT: vle32.v v10, (a5) -; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; RV64-NEXT: vslideup.vx v9, v8, a3 
-; RV64-NEXT: vsetvli a5, zero, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v8, (a2) ; RV64-NEXT: slli a2, a1, 2 -; RV64-NEXT: vsetvli zero, a6, e32, m1, ta, ma -; RV64-NEXT: vslideup.vx v8, v10, a3 -; RV64-NEXT: add a4, a7, a4 -; RV64-NEXT: vsetvli a3, zero, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v10, (a4) ; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: lui a2, 838861 +; RV64-NEXT: addi a2, a2, -819 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: vsetvli zero, a1, e32, m4, ta, ma -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a1, a0, 1 -; RV64-NEXT: add a0, a1, a0 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 16 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: slli a2, a2, 32 +; RV64-NEXT: mulhu a1, a1, a2 +; RV64-NEXT: srli a1, a1, 34 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsseg5e32.v v8, (a0) ; RV64-NEXT: ret %rvl = mul i32 %evl, 5 %interleaved.vec = call @llvm.vector.interleave5( %v0, %v1, %v2, %v3, %v4) @@ -540,100 +365,35 @@ define void @store_factor5_v2( %v0, %v1, %v0, %v1, %v2, %v3, %v4, %v5, %v6, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor7_v2: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 2 -; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: srli a4, a3, 1 -; RV32-NEXT: srli a3, a3, 3 -; RV32-NEXT: add a5, a2, a4 -; RV32-NEXT: add a6, a5, a4 -; RV32-NEXT: add a7, a6, a4 -; RV32-NEXT: add t0, a7, a4 -; RV32-NEXT: vsetvli t1, zero, e32, mf2, ta, ma -; RV32-NEXT: vsseg7e32.v v8, (a2) -; RV32-NEXT: add t1, t0, a4 -; RV32-NEXT: vle32.v v8, (t1) -; RV32-NEXT: vle32.v v10, (t0) -; RV32-NEXT: add t0, a3, a3 -; RV32-NEXT: add a4, t1, a4 -; RV32-NEXT: vle32.v v12, (a7) -; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma -; RV32-NEXT: vslideup.vx v10, v8, a3 -; RV32-NEXT: vsetvli a7, zero, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v11, (a4) -; RV32-NEXT: vle32.v v9, (a6) -; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma -; RV32-NEXT: vslideup.vx v9, v12, a3 -; RV32-NEXT: vsetvli a4, zero, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v12, (a5) -; RV32-NEXT: vle32.v v8, (a2) ; RV32-NEXT: slli a2, a1, 3 ; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: vsetvli zero, t0, e32, m1, ta, ma -; RV32-NEXT: vslideup.vx v8, v12, a3 -; RV32-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 2 -; RV32-NEXT: add sp, sp, a0 -; RV32-NEXT: .cfi_def_cfa sp, 16 -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: lui a1, 149797 +; RV32-NEXT: addi a1, a1, -1755 +; RV32-NEXT: mulhu a1, a2, a1 +; RV32-NEXT: sub a2, a2, a1 +; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV32-NEXT: vsseg7e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_factor7_v2: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: .cfi_def_cfa_offset 16 -; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a2, a2, 2 -; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; RV64-NEXT: addi a2, sp, 16 -; RV64-NEXT: csrr a3, vlenb -; RV64-NEXT: srli a4, a3, 1 -; 
RV64-NEXT: srli a3, a3, 3 -; RV64-NEXT: add a5, a2, a4 -; RV64-NEXT: add a6, a5, a4 -; RV64-NEXT: add a7, a6, a4 -; RV64-NEXT: add t0, a7, a4 -; RV64-NEXT: vsetvli t1, zero, e32, mf2, ta, ma -; RV64-NEXT: vsseg7e32.v v8, (a2) -; RV64-NEXT: add t1, t0, a4 -; RV64-NEXT: vle32.v v8, (t1) -; RV64-NEXT: vle32.v v10, (t0) -; RV64-NEXT: add t0, a3, a3 -; RV64-NEXT: add a4, t1, a4 -; RV64-NEXT: vle32.v v12, (a7) -; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma -; RV64-NEXT: vslideup.vx v10, v8, a3 -; RV64-NEXT: vsetvli a7, zero, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v11, (a4) -; RV64-NEXT: vle32.v v9, (a6) -; RV64-NEXT: vle32.v v13, (a5) -; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma -; RV64-NEXT: vslideup.vx v9, v12, a3 -; RV64-NEXT: vsetvli a4, zero, e32, mf2, ta, ma -; RV64-NEXT: vle32.v v8, (a2) ; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: lui a3, 149797 ; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: vsetvli zero, t0, e32, m1, ta, ma -; RV64-NEXT: vslideup.vx v8, v13, a3 -; RV64-NEXT: srli a2, a2, 32 -; RV64-NEXT: vsetvli zero, a2, e32, m4, ta, ma -; RV64-NEXT: vse32.v v8, (a0) -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: slli a0, a0, 2 -; RV64-NEXT: add sp, sp, a0 -; RV64-NEXT: .cfi_def_cfa sp, 16 -; RV64-NEXT: addi sp, sp, 16 -; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: addi a1, a3, -1755 +; RV64-NEXT: slli a3, a2, 32 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: mulhu a1, a3, a1 +; RV64-NEXT: srli a1, a1, 32 +; RV64-NEXT: subw a2, a2, a1 +; RV64-NEXT: srliw a2, a2, 1 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; RV64-NEXT: vsseg7e32.v v8, (a0) ; RV64-NEXT: ret %rvl = mul i32 %evl, 7 %interleaved.vec = call @llvm.vector.interleave7( %v0, %v1, %v2, %v3, %v4, %v5, %v6) From cbf92d8e772cebef021a92b593b431ddb45eff85 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 12 May 2025 08:48:07 +0300 Subject: [PATCH 3/5] Add opt tests --- .../RISCV/interleaved-accesses.ll | 218 ++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll index f2e2950992421..bd79ec9a09599 100644 --- a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -73,6 +73,41 @@ define void @load_factor3(ptr %ptr) { ret void } +define void @load_factor3_vscale(ptr %ptr) { +; RV32-LABEL: @load_factor3_vscale( +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t.i32(target("riscv.vector.tuple", , 3) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 0) +; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , } poison, [[TMP2]], 0 +; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 1) +; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , } [[TMP3]], [[TMP4]], 1 +; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 2) +; RV32-NEXT: [[V:%.*]] = insertvalue { , , } [[TMP5]], [[TMP6]], 2 +; RV32-NEXT: [[T0:%.*]] = extractvalue { , , } [[V]], 0 +; RV32-NEXT: [[T1:%.*]] = extractvalue { , , } [[V]], 1 +; RV32-NEXT: [[T2:%.*]] = extractvalue { , , } [[V]], 
2 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor3_vscale( +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv8i8_3t.i64(target("riscv.vector.tuple", , 3) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 0) +; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , } poison, [[TMP2]], 0 +; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 1) +; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , } [[TMP3]], [[TMP4]], 1 +; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_3t(target("riscv.vector.tuple", , 3) [[TMP1]], i32 2) +; RV64-NEXT: [[V:%.*]] = insertvalue { , , } [[TMP5]], [[TMP6]], 2 +; RV64-NEXT: [[T0:%.*]] = extractvalue { , , } [[V]], 0 +; RV64-NEXT: [[T1:%.*]] = extractvalue { , , } [[V]], 1 +; RV64-NEXT: [[T2:%.*]] = extractvalue { , , } [[V]], 2 +; RV64-NEXT: ret void +; + %interleaved.vec = load , ptr %ptr + %v = call { , , } @llvm.vector.deinterleave3.nxv6i32( %interleaved.vec) + %t0 = extractvalue { , , } %v, 0 + %t1 = extractvalue { , , } %v, 1 + %t2 = extractvalue { , , } %v, 2 + ret void +} + define void @load_factor4(ptr %ptr) { ; RV32-LABEL: @load_factor4( ; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg4.load.mask.v4i32.i32(ptr [[PTR:%.*]], <4 x i1> splat (i1 true), i32 4) @@ -172,6 +207,55 @@ define void @load_factor5(ptr %ptr) { ret void } +define void @load_factor5_vscale(ptr %ptr) { +; RV32-LABEL: @load_factor5_vscale( +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t.i32(target("riscv.vector.tuple", , 5) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 0) +; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , } poison, [[TMP2]], 0 +; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 1) +; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , , } [[TMP3]], [[TMP4]], 1 +; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 2) +; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , , } [[TMP5]], [[TMP6]], 2 +; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 3) +; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , , } [[TMP7]], [[TMP8]], 3 +; RV32-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 4) +; RV32-NEXT: [[V:%.*]] = insertvalue { , , , , } [[TMP9]], [[TMP10]], 4 +; RV32-NEXT: [[T0:%.*]] = extractvalue { , , , , } [[V]], 0 +; RV32-NEXT: [[T1:%.*]] = extractvalue { , , , , } [[V]], 1 +; RV32-NEXT: [[T2:%.*]] = extractvalue { , , , , } [[V]], 2 +; RV32-NEXT: [[T3:%.*]] = extractvalue { , , , , } [[V]], 3 +; RV32-NEXT: [[T4:%.*]] = extractvalue { , , , , } [[V]], 4 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor5_vscale( +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vlseg5.triscv.vector.tuple_nxv8i8_5t.i64(target("riscv.vector.tuple", , 5) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 0) +; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , } poison, [[TMP2]], 0 +; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 1) +; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , , } [[TMP3]], [[TMP4]], 1 +; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 2) +; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , , } [[TMP5]], [[TMP6]], 2 +; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 3) +; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , , } [[TMP7]], [[TMP8]], 3 +; RV64-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_5t(target("riscv.vector.tuple", , 5) [[TMP1]], i32 4) +; RV64-NEXT: [[V:%.*]] = insertvalue { , , , , } [[TMP9]], [[TMP10]], 4 +; RV64-NEXT: [[T0:%.*]] = extractvalue { , , , , } [[V]], 0 +; RV64-NEXT: [[T1:%.*]] = extractvalue { , , , , } [[V]], 1 +; RV64-NEXT: [[T2:%.*]] = extractvalue { , , , , } [[V]], 2 +; RV64-NEXT: [[T3:%.*]] = extractvalue { , , , , } [[V]], 3 +; RV64-NEXT: [[T4:%.*]] = extractvalue { , , , , } [[V]], 4 +; RV64-NEXT: ret void +; + %interleaved.vec = load , ptr %ptr + %v = call { , , , , } @llvm.vector.deinterleave5.nxv10i32( %interleaved.vec) + %t0 = extractvalue { , , , , } %v, 0 + %t1 = extractvalue { , , , , } %v, 1 + %t2 = extractvalue { , , , , } %v, 2 + %t3 = extractvalue { , , , , } %v, 3 + %t4 = extractvalue { , , , , } %v, 4 + ret void +} + define void @load_factor6(ptr %ptr) { ; RV32-LABEL: @load_factor6( ; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg6.load.mask.v4i32.i32(ptr [[PTR:%.*]], <4 x i1> splat (i1 true), i32 4) @@ -237,6 +321,69 @@ define void @load_factor7(ptr %ptr) { ret void } +define void @load_factor7_vscale(ptr %ptr) { +; RV32-LABEL: @load_factor7_vscale( +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t.i32(target("riscv.vector.tuple", , 7) poison, ptr [[PTR:%.*]], i32 -1, i32 5) +; RV32-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 0) +; RV32-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , } poison, [[TMP2]], 0 +; RV32-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 1) +; RV32-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , } [[TMP3]], [[TMP4]], 1 +; RV32-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 2) +; RV32-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , } [[TMP5]], [[TMP6]], 2 +; RV32-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 3) +; RV32-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , } [[TMP7]], [[TMP8]], 3 +; RV32-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) 
[[TMP1]], i32 4) +; RV32-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , } [[TMP9]], [[TMP10]], 4 +; RV32-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 5) +; RV32-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , } [[TMP11]], [[TMP12]], 5 +; RV32-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 6) +; RV32-NEXT: [[V:%.*]] = insertvalue { , , , , , , } [[TMP13]], [[TMP14]], 6 +; RV32-NEXT: [[T0:%.*]] = extractvalue { , , , , , , } [[V]], 0 +; RV32-NEXT: [[T1:%.*]] = extractvalue { , , , , , , } [[V]], 1 +; RV32-NEXT: [[T2:%.*]] = extractvalue { , , , , , , } [[V]], 2 +; RV32-NEXT: [[T3:%.*]] = extractvalue { , , , , , , } [[V]], 3 +; RV32-NEXT: [[T4:%.*]] = extractvalue { , , , , , , } [[V]], 4 +; RV32-NEXT: [[T5:%.*]] = extractvalue { , , , , , , } [[V]], 5 +; RV32-NEXT: [[T6:%.*]] = extractvalue { , , , , , , } [[V]], 6 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor7_vscale( +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv8i8_7t.i64(target("riscv.vector.tuple", , 7) poison, ptr [[PTR:%.*]], i64 -1, i64 5) +; RV64-NEXT: [[TMP2:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 0) +; RV64-NEXT: [[TMP3:%.*]] = insertvalue { , , , , , , } poison, [[TMP2]], 0 +; RV64-NEXT: [[TMP4:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 1) +; RV64-NEXT: [[TMP5:%.*]] = insertvalue { , , , , , , } [[TMP3]], [[TMP4]], 1 +; RV64-NEXT: [[TMP6:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 2) +; RV64-NEXT: [[TMP7:%.*]] = insertvalue { , , , , , , } [[TMP5]], [[TMP6]], 2 +; RV64-NEXT: [[TMP8:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 3) +; RV64-NEXT: [[TMP9:%.*]] = insertvalue { , , , , , , } [[TMP7]], [[TMP8]], 3 +; RV64-NEXT: [[TMP10:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 4) +; RV64-NEXT: [[TMP11:%.*]] = insertvalue { , , , , , , } [[TMP9]], [[TMP10]], 4 +; RV64-NEXT: [[TMP12:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 5) +; RV64-NEXT: [[TMP13:%.*]] = insertvalue { , , , , , , } [[TMP11]], [[TMP12]], 5 +; RV64-NEXT: [[TMP14:%.*]] = call @llvm.riscv.tuple.extract.nxv2i32.triscv.vector.tuple_nxv8i8_7t(target("riscv.vector.tuple", , 7) [[TMP1]], i32 6) +; RV64-NEXT: [[V:%.*]] = insertvalue { , , , , , , } [[TMP13]], [[TMP14]], 6 +; RV64-NEXT: [[T0:%.*]] = extractvalue { , , , , , , } [[V]], 0 +; RV64-NEXT: [[T1:%.*]] = extractvalue { , , , , , , } [[V]], 1 +; RV64-NEXT: [[T2:%.*]] = extractvalue { , , , , , , } [[V]], 2 +; RV64-NEXT: [[T3:%.*]] = extractvalue { , , , , , , } [[V]], 3 +; RV64-NEXT: [[T4:%.*]] = extractvalue { , , , , , , } [[V]], 4 +; RV64-NEXT: [[T5:%.*]] = extractvalue { , , , , , , } [[V]], 5 +; RV64-NEXT: [[T6:%.*]] = extractvalue { , , , , , , } [[V]], 6 +; RV64-NEXT: ret void +; + %interleaved.vec = load , ptr %ptr + %v = call { , , , , , , } @llvm.vector.deinterleave7.nxv14i32( %interleaved.vec) + %t0 = extractvalue { , , , , , , } %v, 0 + %t1 = extractvalue { , , , , , , } 
%v, 1 + %t2 = extractvalue { , , , , , , } %v, 2 + %t3 = extractvalue { , , , , , , } %v, 3 + %t4 = extractvalue { , , , , , , } %v, 4 + %t5 = extractvalue { , , , , , , } %v, 5 + %t6 = extractvalue { , , , , , , } %v, 6 + ret void +} + define void @load_factor8(ptr %ptr) { ; RV32-LABEL: @load_factor8( ; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg8.load.mask.v4i32.i32(ptr [[PTR:%.*]], <4 x i1> splat (i1 true), i32 4) @@ -421,6 +568,26 @@ define void @store_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2 ret void } +define void @store_factor3_vscale(ptr %ptr, %v0, %v1, %v2) { +; RV32-LABEL: @store_factor3_vscale( +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) poison, [[V0:%.*]], i32 0) +; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) [[TMP1]], [[V1:%.*]], i32 1) +; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) [[TMP2]], [[V2:%.*]], i32 2) +; RV32-NEXT: call void @llvm.riscv.vsseg3.triscv.vector.tuple_nxv8i8_3t.i32(target("riscv.vector.tuple", , 3) [[TMP3]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor3_vscale( +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) poison, [[V0:%.*]], i32 0) +; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) [[TMP1]], [[V1:%.*]], i32 1) +; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 3) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) [[TMP2]], [[V2:%.*]], i32 2) +; RV64-NEXT: call void @llvm.riscv.vsseg3.triscv.vector.tuple_nxv8i8_3t.i64(target("riscv.vector.tuple", , 3) [[TMP3]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: ret void +; + %interleaved.vec = call @llvm.vector.interleave3.nxv8i8( %v0, %v1, %v2) + store %interleaved.vec, ptr %ptr + ret void +} + define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { ; RV32-LABEL: @store_factor4( ; RV32-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> @@ -473,6 +640,29 @@ define void @store_factor4_vscale(ptr %ptr, %v0, %v0, %v1, %v2, %v3, %v4) { +; RV32-LABEL: @store_factor5_vscale( +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) poison, [[V0:%.*]], i32 0) +; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP1]], [[V1:%.*]], i32 1) +; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP2]], [[V2:%.*]], i32 2) +; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP3]], [[V3:%.*]], i32 3) +; RV32-NEXT: 
[[TMP5:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP4]], [[V4:%.*]], i32 4) +; RV32-NEXT: call void @llvm.riscv.vsseg5.triscv.vector.tuple_nxv8i8_5t.i32(target("riscv.vector.tuple", , 5) [[TMP5]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor5_vscale( +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) poison, [[V0:%.*]], i32 0) +; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP1]], [[V1:%.*]], i32 1) +; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP2]], [[V2:%.*]], i32 2) +; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP3]], [[V3:%.*]], i32 3) +; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 5) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) [[TMP4]], [[V4:%.*]], i32 4) +; RV64-NEXT: call void @llvm.riscv.vsseg5.triscv.vector.tuple_nxv8i8_5t.i64(target("riscv.vector.tuple", , 5) [[TMP5]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: ret void +; + %interleaved.vec = call @llvm.vector.interleave5.nxv8i8( %v0, %v1, %v2, %v3, %v4) + store %interleaved.vec, ptr %ptr + ret void +} define void @store_factor2_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1) { ; RV32-LABEL: @store_factor2_wide( @@ -546,6 +736,34 @@ define void @store_factor4_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32 ret void } +define void @store_factor7_vscale(ptr %ptr, %v0, %v1, %v2, %v3, %v4, %v5, %v6) { +; RV32-LABEL: @store_factor7_vscale( +; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) poison, [[V0:%.*]], i32 0) +; RV32-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP1]], [[V1:%.*]], i32 1) +; RV32-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP2]], [[V2:%.*]], i32 2) +; RV32-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP3]], [[V3:%.*]], i32 3) +; RV32-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP4]], [[V4:%.*]], i32 4) +; RV32-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP5]], [[V5:%.*]], i32 5) +; RV32-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP6]], [[V6:%.*]], i32 6) +; RV32-NEXT: call void @llvm.riscv.vsseg7.triscv.vector.tuple_nxv8i8_7t.i32(target("riscv.vector.tuple", , 7) [[TMP7]], ptr [[PTR:%.*]], i32 -1, i32 3) +; RV32-NEXT: ret void +; +; 
RV64-LABEL: @store_factor7_vscale( +; RV64-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) poison, [[V0:%.*]], i32 0) +; RV64-NEXT: [[TMP2:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP1]], [[V1:%.*]], i32 1) +; RV64-NEXT: [[TMP3:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP2]], [[V2:%.*]], i32 2) +; RV64-NEXT: [[TMP4:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP3]], [[V3:%.*]], i32 3) +; RV64-NEXT: [[TMP5:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP4]], [[V4:%.*]], i32 4) +; RV64-NEXT: [[TMP6:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP5]], [[V5:%.*]], i32 5) +; RV64-NEXT: [[TMP7:%.*]] = call target("riscv.vector.tuple", , 7) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) [[TMP6]], [[V6:%.*]], i32 6) +; RV64-NEXT: call void @llvm.riscv.vsseg7.triscv.vector.tuple_nxv8i8_7t.i64(target("riscv.vector.tuple", , 7) [[TMP7]], ptr [[PTR:%.*]], i64 -1, i64 3) +; RV64-NEXT: ret void +; + %interleaved.vec = call @llvm.vector.interleave7.nxv8i8( %v0, %v1, %v2, %v3, %v4, %v5, %v6) + store %interleaved.vec, ptr %ptr + ret void +} + define void @store_factor8_vscale(ptr %ptr, %v0, %v1, %v2, %v3) { ; RV32-LABEL: @store_factor8_vscale( ; RV32-NEXT: [[TMP1:%.*]] = call target("riscv.vector.tuple", , 8) @llvm.riscv.tuple.insert.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, [[V0:%.*]], i32 0) From b9f0574867754e86e3860a4247554f6261ce6779 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Mon, 12 May 2025 09:01:40 +0300 Subject: [PATCH 4/5] Update comment to mention other intrinsics --- llvm/include/llvm/CodeGen/TargetLowering.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 03099e9ad44dc..1749ac1770da9 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3234,7 +3234,7 @@ class TargetLoweringBase { /// Lower a deinterleave intrinsic to a target specific load intrinsic. /// Return true on success. Currently only supports - /// llvm.vector.deinterleave2 + /// llvm.vector.deinterleave{2,3,5,7} /// /// \p LI is the accompanying load instruction. /// \p DeinterleaveValues contains the deinterleaved values. @@ -3246,7 +3246,7 @@ class TargetLoweringBase { /// Lower an interleave intrinsic to a target specific store intrinsic. /// Return true on success. Currently only supports - /// llvm.vector.interleave2 + /// llvm.vector.interleave{2,3,5,7} /// /// \p SI is the accompanying store instruction /// \p InterleaveValues contains the interleaved values. 
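Illustrative note (a sketch, not taken from the patches): the TargetLowering hooks documented above are handed a plain load whose result feeds a deinterleave intrinsic. A minimal factor-3 example of that pattern, with the <vscale x 2 x i32> field type and the names @illustrative_load_factor3/%p chosen purely for illustration:

  define void @illustrative_load_factor3(ptr %p) {
    ; one wide load of three interleaved fields
    %wide = load <vscale x 6 x i32>, ptr %p
    ; deinterleave3 splits the wide value back into its three fields
    %dei = call { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } @llvm.vector.deinterleave3.nxv6i32(<vscale x 6 x i32> %wide)
    %f0 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %dei, 0
    %f1 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %dei, 1
    %f2 = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i32> } %dei, 2
    ret void
  }

On RISC-V the interleaved-access pass folds the load/deinterleave3 pair into a single vlseg3 segment load, which is what the load_factor3_vscale test above checks.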
From 33652d34bc7f0ccd19c602566e01d58d48986898 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Tue, 13 May 2025 00:04:31 +0100
Subject: [PATCH 5/5] Update comments

---
 llvm/lib/CodeGen/InterleavedAccessPass.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index c590e470fa779..960c7956e0011 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -660,7 +660,7 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
   }
 
   const unsigned Factor = Operands.size();
-  // Currently we only recognize factors of 2, 3, 5 and 7.
+  // Currently we only recognize factors of 3, 5, 7, and powers of 2.
   // FIXME: should we assert here instead?
   if (Factor <= 1 ||
       (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
@@ -737,7 +737,7 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
   }
 
   const unsigned Factor = Results.size();
-  // Currently we only recognize factors of 2, 3, 5 and 7.
+  // Currently we only recognize factors of 3, 5, 7, and powers of 2.
   // FIXME: should we assert here instead?
   if (Factor <= 1 ||
       (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
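Illustrative note (a sketch, not taken from the patches): the "powers of 2" part of the updated comment refers to interleave trees built out of llvm.vector.interleave2, which the pass can flatten into one wider factor; the dedicated interleave3/5/7 intrinsics supply the odd factors. A hypothetical factor-4 store in that nested form, with <vscale x 2 x i32> operands and the names @illustrative_store_factor4/%a..%d/%p chosen for illustration:

  define void @illustrative_store_factor4(ptr %p, <vscale x 2 x i32> %a, <vscale x 2 x i32> %b, <vscale x 2 x i32> %c, <vscale x 2 x i32> %d) {
    ; interleave2(interleave2(a,c), interleave2(b,d)) produces lanes a0,b0,c0,d0,a1,b1,c1,d1,...
    %lo = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %a, <vscale x 2 x i32> %c)
    %hi = call <vscale x 4 x i32> @llvm.vector.interleave2.nxv4i32(<vscale x 2 x i32> %b, <vscale x 2 x i32> %d)
    %vec = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %lo, <vscale x 4 x i32> %hi)
    store <vscale x 8 x i32> %vec, ptr %p
    ret void
  }

Under that reading, a tree like this is accepted with Factor == 4 because four is a power of two, matching the comment text introduced in this patch.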