Skip to content

Commit bb8a877

Browse files
authored
[RISCV] Exploit register boundaries when lowering shuffle with exact vlen (#79072)
If we have a shuffle which is larger than m1, we may be able to split it into a series of individual m1 shuffles. This patch starts with the subcase where the mask allows a 1-to-1 mapping from source register to destination register - each with a possible permutation of their own. We can potentially extend this later, though in practice this seems to already catch a number of the most interesting cases.
1 parent 0cf20c2 commit bb8a877

File tree

2 files changed

+118
-66
lines changed

2 files changed

+118
-66
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

+84
Original file line numberDiff line numberDiff line change
@@ -4652,6 +4652,85 @@ static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN,
46524652
return DAG.getBitcast(VT, Rotate);
46534653
}
46544654

4655+
// If compiling with an exactly known VLEN, see if we can split a
4656+
// shuffle on m2 or larger into a small number of m1 sized shuffles
4657+
// which write each destination registers exactly once.
4658+
static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
4659+
SelectionDAG &DAG,
4660+
const RISCVSubtarget &Subtarget) {
4661+
SDLoc DL(SVN);
4662+
MVT VT = SVN->getSimpleValueType(0);
4663+
SDValue V1 = SVN->getOperand(0);
4664+
SDValue V2 = SVN->getOperand(1);
4665+
ArrayRef<int> Mask = SVN->getMask();
4666+
unsigned NumElts = VT.getVectorNumElements();
4667+
4668+
// If we don't know exact data layout, not much we can do. If this
4669+
// is already m1 or smaller, no point in splitting further.
4670+
const unsigned MinVLen = Subtarget.getRealMinVLen();
4671+
const unsigned MaxVLen = Subtarget.getRealMaxVLen();
4672+
if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen)
4673+
return SDValue();
4674+
4675+
MVT ElemVT = VT.getVectorElementType();
4676+
unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
4677+
unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
4678+
4679+
SmallVector<std::pair<int, SmallVector<int>>>
4680+
OutMasks(VRegsPerSrc, {-1, {}});
4681+
4682+
// Check if our mask can be done as a 1-to-1 mapping from source
4683+
// to destination registers in the group without needing to
4684+
// write each destination more than once.
4685+
for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
4686+
int DstVecIdx = DstIdx / ElemsPerVReg;
4687+
int DstSubIdx = DstIdx % ElemsPerVReg;
4688+
int SrcIdx = Mask[DstIdx];
4689+
if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
4690+
continue;
4691+
int SrcVecIdx = SrcIdx / ElemsPerVReg;
4692+
int SrcSubIdx = SrcIdx % ElemsPerVReg;
4693+
if (OutMasks[DstVecIdx].first == -1)
4694+
OutMasks[DstVecIdx].first = SrcVecIdx;
4695+
if (OutMasks[DstVecIdx].first != SrcVecIdx)
4696+
// Note: This case could easily be handled by keeping track of a chain
4697+
// of source values and generating two element shuffles below. This is
4698+
// less an implementation question, and more a profitability one.
4699+
return SDValue();
4700+
4701+
OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
4702+
OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
4703+
}
4704+
4705+
EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
4706+
MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
4707+
MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
4708+
assert(M1VT == getLMUL1VT(M1VT));
4709+
unsigned NumOpElts = M1VT.getVectorMinNumElements();
4710+
SDValue Vec = DAG.getUNDEF(ContainerVT);
4711+
// The following semantically builds up a fixed length concat_vector
4712+
// of the component shuffle_vectors. We eagerly lower to scalable here
4713+
// to avoid DAG combining it back to a large shuffle_vector again.
4714+
V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
4715+
V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
4716+
for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
4717+
auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
4718+
if (SrcVecIdx == -1)
4719+
continue;
4720+
unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
4721+
SDValue SrcVec = (unsigned)SrcVecIdx > VRegsPerSrc ? V2 : V1;
4722+
SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
4723+
DAG.getVectorIdxConstant(ExtractIdx, DL));
4724+
SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
4725+
SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
4726+
SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
4727+
unsigned InsertIdx = DstVecIdx * NumOpElts;
4728+
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
4729+
DAG.getVectorIdxConstant(InsertIdx, DL));
4730+
}
4731+
return convertFromScalableVector(VT, Vec, DAG, Subtarget);
4732+
}
4733+
46554734
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
46564735
const RISCVSubtarget &Subtarget) {
46574736
SDValue V1 = Op.getOperand(0);
@@ -4759,6 +4838,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
47594838
}
47604839
}
47614840

4841+
// For exact VLEN m2 or greater, try to split to m1 operations if we
4842+
// can split cleanly.
4843+
if (SDValue V = lowerShuffleViaVRegSplitting(SVN, DAG, Subtarget))
4844+
return V;
4845+
47624846
ArrayRef<int> Mask = SVN->getMask();
47634847

47644848
if (SDValue V =

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll

+34-66
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,10 @@ define <4 x i64> @m2_splat_0(<4 x i64> %v1) vscale_range(2,2) {
1616
define <4 x i64> @m2_splat_in_chunks(<4 x i64> %v1) vscale_range(2,2) {
1717
; CHECK-LABEL: m2_splat_in_chunks:
1818
; CHECK: # %bb.0:
19-
; CHECK-NEXT: lui a0, 8224
20-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
21-
; CHECK-NEXT: vmv.s.x v10, a0
22-
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
23-
; CHECK-NEXT: vsext.vf2 v12, v10
24-
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
25-
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
26-
; CHECK-NEXT: vmv.v.v v8, v10
19+
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
20+
; CHECK-NEXT: vrgather.vi v10, v8, 0
21+
; CHECK-NEXT: vrgather.vi v11, v9, 0
22+
; CHECK-NEXT: vmv2r.v v8, v10
2723
; CHECK-NEXT: ret
2824
%res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
2925
ret <4 x i64> %res
@@ -32,12 +28,12 @@ define <4 x i64> @m2_splat_in_chunks(<4 x i64> %v1) vscale_range(2,2) {
3228
define <8 x i64> @m4_splat_in_chunks(<8 x i64> %v1) vscale_range(2,2) {
3329
; CHECK-LABEL: m4_splat_in_chunks:
3430
; CHECK: # %bb.0:
35-
; CHECK-NEXT: lui a0, %hi(.LCPI2_0)
36-
; CHECK-NEXT: addi a0, a0, %lo(.LCPI2_0)
37-
; CHECK-NEXT: vl1re16.v v16, (a0)
38-
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
39-
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
40-
; CHECK-NEXT: vmv.v.v v8, v12
31+
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
32+
; CHECK-NEXT: vrgather.vi v12, v8, 0
33+
; CHECK-NEXT: vrgather.vi v13, v9, 0
34+
; CHECK-NEXT: vrgather.vi v14, v10, 0
35+
; CHECK-NEXT: vrgather.vi v15, v11, 1
36+
; CHECK-NEXT: vmv4r.v v8, v12
4137
; CHECK-NEXT: ret
4238
%res = shufflevector <8 x i64> %v1, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 7, i32 7>
4339
ret <8 x i64> %res
@@ -47,14 +43,10 @@ define <8 x i64> @m4_splat_in_chunks(<8 x i64> %v1) vscale_range(2,2) {
4743
define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) {
4844
; CHECK-LABEL: m2_splat_with_tail:
4945
; CHECK: # %bb.0:
50-
; CHECK-NEXT: lui a0, 12320
51-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
52-
; CHECK-NEXT: vmv.s.x v10, a0
53-
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
54-
; CHECK-NEXT: vsext.vf2 v12, v10
55-
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
56-
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
57-
; CHECK-NEXT: vmv.v.v v8, v10
46+
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
47+
; CHECK-NEXT: vrgather.vi v10, v8, 0
48+
; CHECK-NEXT: vmv1r.v v11, v9
49+
; CHECK-NEXT: vmv2r.v v8, v10
5850
; CHECK-NEXT: ret
5951
%res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
6052
ret <4 x i64> %res
@@ -63,15 +55,12 @@ define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) {
6355
define <4 x i64> @m2_pair_swap_vl4(<4 x i64> %v1) vscale_range(2,2) {
6456
; CHECK-LABEL: m2_pair_swap_vl4:
6557
; CHECK: # %bb.0:
66-
; CHECK-NEXT: lui a0, 8240
67-
; CHECK-NEXT: addi a0, a0, 1
68-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
69-
; CHECK-NEXT: vmv.s.x v10, a0
70-
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
71-
; CHECK-NEXT: vsext.vf2 v12, v10
72-
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
73-
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
74-
; CHECK-NEXT: vmv.v.v v8, v10
58+
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
59+
; CHECK-NEXT: vslidedown.vi v11, v9, 1
60+
; CHECK-NEXT: vslideup.vi v11, v9, 1
61+
; CHECK-NEXT: vslidedown.vi v10, v8, 1
62+
; CHECK-NEXT: vslideup.vi v10, v8, 1
63+
; CHECK-NEXT: vmv2r.v v8, v10
7564
; CHECK-NEXT: ret
7665
%res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
7766
ret <4 x i64> %res
@@ -107,14 +96,10 @@ define <8 x i32> @m2_pair_swap_vl8(<8 x i32> %v1) vscale_range(2,2) {
10796
define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) {
10897
; CHECK-LABEL: m2_splat_into_identity:
10998
; CHECK: # %bb.0:
110-
; CHECK-NEXT: lui a0, 12320
111-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
112-
; CHECK-NEXT: vmv.s.x v10, a0
113-
; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
114-
; CHECK-NEXT: vsext.vf2 v12, v10
115-
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
116-
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
117-
; CHECK-NEXT: vmv.v.v v8, v10
99+
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
100+
; CHECK-NEXT: vrgather.vi v10, v8, 0
101+
; CHECK-NEXT: vmv1r.v v11, v9
102+
; CHECK-NEXT: vmv2r.v v8, v10
118103
; CHECK-NEXT: ret
119104
%res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 0, i32 2, i32 3>
120105
ret <4 x i64> %res
@@ -123,12 +108,7 @@ define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) {
123108
define <4 x i64> @m2_broadcast_i128(<4 x i64> %v1) vscale_range(2,2) {
124109
; CHECK-LABEL: m2_broadcast_i128:
125110
; CHECK: # %bb.0:
126-
; CHECK-NEXT: lui a0, 16
127-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
128-
; CHECK-NEXT: vmv.v.x v12, a0
129-
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
130-
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
131-
; CHECK-NEXT: vmv.v.v v8, v10
111+
; CHECK-NEXT: vmv1r.v v9, v8
132112
; CHECK-NEXT: ret
133113
%res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
134114
ret <4 x i64> %res
@@ -137,12 +117,9 @@ define <4 x i64> @m2_broadcast_i128(<4 x i64> %v1) vscale_range(2,2) {
137117
define <8 x i64> @m4_broadcast_i128(<8 x i64> %v1) vscale_range(2,2) {
138118
; CHECK-LABEL: m4_broadcast_i128:
139119
; CHECK: # %bb.0:
140-
; CHECK-NEXT: lui a0, 16
141-
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
142-
; CHECK-NEXT: vmv.v.x v16, a0
143-
; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma
144-
; CHECK-NEXT: vrgatherei16.vv v12, v8, v16
145-
; CHECK-NEXT: vmv.v.v v8, v12
120+
; CHECK-NEXT: vmv1r.v v9, v8
121+
; CHECK-NEXT: vmv1r.v v10, v8
122+
; CHECK-NEXT: vmv1r.v v11, v8
146123
; CHECK-NEXT: ret
147124
%res = shufflevector <8 x i64> %v1, <8 x i64> poison, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
148125
ret <8 x i64> %res
@@ -152,13 +129,10 @@ define <8 x i64> @m4_broadcast_i128(<8 x i64> %v1) vscale_range(2,2) {
152129
define <4 x i64> @m2_splat_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
153130
; CHECK-LABEL: m2_splat_two_source:
154131
; CHECK: # %bb.0:
155-
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
132+
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
156133
; CHECK-NEXT: vrgather.vi v12, v8, 0
157-
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
158-
; CHECK-NEXT: vmv.v.i v0, 12
159-
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu
160-
; CHECK-NEXT: vrgather.vi v12, v10, 3, v0.t
161-
; CHECK-NEXT: vmv.v.v v8, v12
134+
; CHECK-NEXT: vrgather.vi v13, v11, 1
135+
; CHECK-NEXT: vmv2r.v v8, v12
162136
; CHECK-NEXT: ret
163137
%res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> <i32 0, i32 0, i32 7, i32 7>
164138
ret <4 x i64> %res
@@ -167,15 +141,9 @@ define <4 x i64> @m2_splat_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range
167141
define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) {
168142
; CHECK-LABEL: m2_splat_into_identity_two_source:
169143
; CHECK: # %bb.0:
170-
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
171-
; CHECK-NEXT: vrgather.vi v12, v8, 0
172-
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
173-
; CHECK-NEXT: vmv.v.i v0, 12
174-
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
175-
; CHECK-NEXT: vid.v v8
176-
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu
177-
; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t
178-
; CHECK-NEXT: vmv.v.v v8, v12
144+
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma
145+
; CHECK-NEXT: vrgather.vi v10, v8, 0
146+
; CHECK-NEXT: vmv2r.v v8, v10
179147
; CHECK-NEXT: ret
180148
%res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> <i32 0, i32 0, i32 6, i32 7>
181149
ret <4 x i64> %res

0 commit comments

Comments
 (0)