[RISCV] Use vmv.v.x to materialize masks in deinterleave2 lowering #118500

Merged: 1 commit, Dec 17, 2024

32 changes: 16 additions & 16 deletions llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -10744,23 +10744,23 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
return DAG.getMergeValues({Even, Odd}, DL);
}

// For the indices, use the same SEW to avoid an extra vsetvli
// TODO: If container type is larger than m1, we can consider using a splat
// of a constant instead of the following sequence

// Create a vector of even indices {0, 1, 2, ...}
MVT IdxVT = ConcatVT.changeVectorElementTypeToInteger();
SDValue StepVec = DAG.getStepVector(DL, IdxVT);
// 0, 1, 0, 1, 0, 1
SDValue ZeroOnes =
DAG.getNode(ISD::AND, DL, IdxVT, StepVec, DAG.getConstant(1, DL, IdxVT));
// For the indices, use the vmv.v.x of an i8 constant to fill the largest
// possible mask vector, then extract the required subvector. Doing this
// (instead of a vid, vmsne sequence) reduces LMUL, and allows the mask
// creation to be rematerialized during register allocation to reduce
// register pressure if needed.

MVT MaskVT = ConcatVT.changeVectorElementType(MVT::i1);
SDValue EvenMask =
DAG.getSetCC(DL, MaskVT, ZeroOnes, DAG.getConstant(0, DL, IdxVT),
ISD::CondCode::SETEQ);
// Have the latter be the not of the former to minimize the live range of
// the index vector since that might be large.
SDValue OddMask = DAG.getLogicalNOT(DL, EvenMask, MaskVT);

SDValue EvenSplat = DAG.getConstant(0b01010101, DL, MVT::nxv8i8);
EvenSplat = DAG.getBitcast(MVT::nxv64i1, EvenSplat);
SDValue EvenMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT, EvenSplat,
DAG.getVectorIdxConstant(0, DL));

SDValue OddSplat = DAG.getConstant(0b10101010, DL, MVT::nxv8i8);
OddSplat = DAG.getBitcast(MVT::nxv64i1, OddSplat);
SDValue OddMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT, OddSplat,
DAG.getVectorIdxConstant(0, DL));

// vcompress the even and odd elements into two separate vectors
SDValue EvenWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
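
As context for the change above and for the li a0, 85 / li a0, 170 constants in the updated CHECK lines below: the lowering now splats the i8 patterns 0b01010101 (85) and 0b10101010 (170) and reinterprets the bytes as a bit-per-element mask, so the even mask is set on lanes 0, 2, 4, ... and the odd mask on lanes 1, 3, 5, ... Below is a minimal sketch in plain C++ (not the SelectionDAG API), assuming the LSB-first, bit-per-element mask layout that the nxv8i8 -> nxv64i1 bitcast relies on.

// Minimal sketch: splat a byte pattern, then read it back as a
// bit-per-element mask (element i lives at bit i%8 of byte i/8).
#include <cstdint>
#include <cstdio>
#include <vector>

static std::vector<bool> splatToMask(uint8_t Pattern, unsigned NumElts) {
  // Analogue of vmv.v.x: every byte of the mask register gets Pattern.
  std::vector<uint8_t> Bytes((NumElts + 7) / 8, Pattern);
  std::vector<bool> Mask(NumElts);
  for (unsigned I = 0; I < NumElts; ++I)
    Mask[I] = (Bytes[I / 8] >> (I % 8)) & 1; // LSB-first bit-per-element view
  return Mask;
}

int main() {
  const unsigned NumElts = 16;
  std::vector<bool> Even = splatToMask(0b01010101, NumElts); // li a0, 85
  std::vector<bool> Odd  = splatToMask(0b10101010, NumElts); // li a0, 170
  for (unsigned I = 0; I < NumElts; ++I)
    std::printf("elt %2u: even=%d odd=%d\n", I, int(Even[I]), int(Odd[I]));
  // Prints even=1 on lanes 0,2,4,... and odd=1 on lanes 1,3,5,...
}

Each mask is then fed straight to vcompress, and because it is just a splatted immediate the register allocator can rematerialize it rather than spill it; that register-pressure benefit is what shrinks the stack frames in the test diffs below from 24 * vlenb to 16 * vlenb.
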
58 changes: 23 additions & 35 deletions llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -106,56 +106,44 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_load_nxv8i6
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: li a1, 85
; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v16, a1
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: vl8re64.v v16, (a0)
; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma
; CHECK-NEXT: vid.v v8
; CHECK-NEXT: vl8re64.v v24, (a0)
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: vand.vi v8, v8, 1
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: vmseq.vi v24, v8, 0
; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmnot.m v6, v24
; CHECK-NEXT: vcompress.vm v8, v16, v24
; CHECK-NEXT: vmv1r.v v13, v24
; CHECK-NEXT: vcompress.vm v24, v16, v6
; CHECK-NEXT: vmv1r.v v12, v6
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vcompress.vm v0, v16, v13
; CHECK-NEXT: li a1, 170
; CHECK-NEXT: vl8re64.v v0, (a0)
; CHECK-NEXT: vmv.v.x v17, a1
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vcompress.vm v8, v24, v16
; CHECK-NEXT: vmv1r.v v12, v16
; CHECK-NEXT: vmv1r.v v13, v17
; CHECK-NEXT: vcompress.vm v16, v24, v13
; CHECK-NEXT: vcompress.vm v24, v0, v12
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vcompress.vm v0, v16, v12
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vcompress.vm v24, v0, v13
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v12, v16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v12, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v28, v16
; CHECK-NEXT: vmv8r.v v16, v24
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v20, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
141 changes: 63 additions & 78 deletions llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -71,15 +71,16 @@ ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv4i64(<vscale x 4 x i64> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmv.v.x v16, a0
; CHECK-NEXT: li a0, 170
; CHECK-NEXT: vmv.v.x v17, a0
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vid.v v12
; CHECK-NEXT: vand.vi v12, v12, 1
; CHECK-NEXT: vmseq.vi v16, v12, 0
; CHECK-NEXT: vcompress.vm v12, v8, v16
; CHECK-NEXT: vmnot.m v14, v16
; CHECK-NEXT: vcompress.vm v16, v8, v14
; CHECK-NEXT: vcompress.vm v20, v8, v17
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: vmv2r.v v10, v16
; CHECK-NEXT: vmv2r.v v10, v20
; CHECK-NEXT: ret
%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
@@ -88,15 +89,16 @@ ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv8i64(<vscale x 8 x i64> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv4i64_nxv8i64:
; CHECK: # %bb.0:
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v24, a0
; CHECK-NEXT: li a0, 170
; CHECK-NEXT: vmv.v.x v25, a0
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vand.vi v16, v16, 1
; CHECK-NEXT: vmseq.vi v24, v16, 0
; CHECK-NEXT: vcompress.vm v16, v8, v24
; CHECK-NEXT: vmnot.m v20, v24
; CHECK-NEXT: vcompress.vm v24, v8, v20
; CHECK-NEXT: vcompress.vm v0, v8, v25
; CHECK-NEXT: vmv4r.v v8, v16
; CHECK-NEXT: vmv4r.v v12, v24
; CHECK-NEXT: vmv4r.v v12, v0
; CHECK-NEXT: ret
%retval = call {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %vec)
ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
@@ -182,50 +184,41 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v7, a0
; CHECK-NEXT: li a0, 170
; CHECK-NEXT: vmv.v.x v6, a0
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vand.vi v24, v16, 1
; CHECK-NEXT: vmseq.vi v16, v24, 0
; CHECK-NEXT: vcompress.vm v24, v8, v16
; CHECK-NEXT: vcompress.vm v24, v8, v7
; CHECK-NEXT: vmv1r.v v28, v7
; CHECK-NEXT: vmv1r.v v29, v6
; CHECK-NEXT: vcompress.vm v0, v8, v29
; CHECK-NEXT: vcompress.vm v8, v16, v28
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmnot.m v17, v16
; CHECK-NEXT: vcompress.vm v0, v8, v17
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vcompress.vm v24, v8, v16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vcompress.vm v8, v16, v29
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vcompress.vm v24, v8, v17
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v28, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v20, v8
; CHECK-NEXT: vmv4r.v v4, v24
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: vmv4r.v v4, v8
; CHECK-NEXT: vmv8r.v v8, v24
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
@@ -350,15 +343,16 @@ ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv4f64(<vscale x 4 x double> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vmv.v.x v16, a0
; CHECK-NEXT: li a0, 170
; CHECK-NEXT: vmv.v.x v17, a0
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vid.v v12
; CHECK-NEXT: vand.vi v12, v12, 1
; CHECK-NEXT: vmseq.vi v16, v12, 0
; CHECK-NEXT: vcompress.vm v12, v8, v16
; CHECK-NEXT: vmnot.m v14, v16
; CHECK-NEXT: vcompress.vm v16, v8, v14
; CHECK-NEXT: vcompress.vm v20, v8, v17
; CHECK-NEXT: vmv2r.v v8, v12
; CHECK-NEXT: vmv2r.v v10, v16
; CHECK-NEXT: vmv2r.v v10, v20
; CHECK-NEXT: ret
%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
@@ -423,50 +417,41 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT: li a0, 85
; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.x v7, a0
; CHECK-NEXT: li a0, 170
; CHECK-NEXT: vmv.v.x v6, a0
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
; CHECK-NEXT: vid.v v16
; CHECK-NEXT: vand.vi v24, v16, 1
; CHECK-NEXT: vmseq.vi v16, v24, 0
; CHECK-NEXT: vcompress.vm v24, v8, v16
; CHECK-NEXT: vcompress.vm v24, v8, v7
; CHECK-NEXT: vmv1r.v v28, v7
; CHECK-NEXT: vmv1r.v v29, v6
; CHECK-NEXT: vcompress.vm v0, v8, v29
; CHECK-NEXT: vcompress.vm v8, v16, v28
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vmnot.m v17, v16
; CHECK-NEXT: vcompress.vm v0, v8, v17
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vcompress.vm v24, v8, v16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vcompress.vm v8, v16, v29
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: vcompress.vm v24, v8, v17
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v28, v8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vmv4r.v v20, v8
; CHECK-NEXT: vmv4r.v v4, v24
; CHECK-NEXT: vmv8r.v v8, v16
; CHECK-NEXT: vmv4r.v v4, v8
; CHECK-NEXT: vmv8r.v v8, v24
; CHECK-NEXT: vmv8r.v v16, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16