Commit 49fd2dd

[X86] LowerShift - don't prematurely lower to x86 vector shift imm instructions (#120282)
When splitting a shift by 2 unique amounts into shuffle(shift(x,c1),shift(x,c2)), don't lower directly with getTargetVShiftByConstNode; use generic shifts to ensure we make use of any further canonicalization (shl(X,1) to add(X,X) etc.), which can have notably better throughput on some x86 targets. Noticed on #120270
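As a rough illustration (this function is a made-up example, not one of the commit's tests), a vector shift with two unique per-lane amounts such as:

define <8 x i16> @two_amount_shl(<8 x i16> %x) {
  ; splits into shuffle(shl(%x, 1), shl(%x, 2)); after this change the
  ; shl-by-1 half goes through the generic path and can canonicalize to
  ; add(%x, %x)
  %r = shl <8 x i16> %x, <i16 1, i16 2, i16 1, i16 2, i16 1, i16 2, i16 1, i16 2>
  ret <8 x i16> %r
}

would now be expected to lower the shift-by-1 half as paddw rather than psllw $1, blended with the psllw $2 half (mirroring the test11/test12 updates below; the exact codegen is an assumption for illustration).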
1 parent 3eca15c commit 49fd2dd

5 files changed (+25 -25 lines changed)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
@@ -30105,9 +30105,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
         (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
          canWidenShuffleElements(ShuffleMask))) {
       SDValue Shift1 =
-          getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtA, DAG);
+          DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtA, dl, VT));
       SDValue Shift2 =
-          getTargetVShiftByConstNode(X86OpcI, dl, VT, R, AmtB, DAG);
+          DAG.getNode(Opc, dl, VT, R, DAG.getConstant(AmtB, dl, VT));
       return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
     }
   }

llvm/test/CodeGen/X86/combine-sdiv.ll

Lines changed: 8 additions & 8 deletions
@@ -2190,7 +2190,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; SSE41-NEXT: pxor %xmm4, %xmm4
 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: psllw $1, %xmm2
+; SSE41-NEXT: paddw %xmm2, %xmm2
 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
 ; SSE41-NEXT: psrlw $8, %xmm2
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
@@ -2202,9 +2202,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE41-NEXT: psraw $8, %xmm0
 ; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $1, %xmm3
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
+; SSE41-NEXT: psllw $7, %xmm3
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
 ; SSE41-NEXT: psrlw $8, %xmm0
 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE41-NEXT: psraw $8, %xmm2
@@ -2225,7 +2225,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
 ; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -2235,9 +2235,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3
-; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1

llvm/test/CodeGen/X86/lower-vec-shift.ll

Lines changed: 7 additions & 7 deletions
@@ -265,11 +265,11 @@ define <16 x i16> @test11(<16 x i16> %a) {
 ; AVX1-LABEL: test11:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
+; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6],xmm1[7]
 ; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -294,18 +294,18 @@ define <16 x i16> @test12(<16 x i16> %a) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test12:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
-; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
 ; AVX2-NEXT: retq
 %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>

llvm/test/CodeGen/X86/vec_shift6.ll

Lines changed: 7 additions & 7 deletions
@@ -28,20 +28,20 @@ define <8 x i16> @test2(<8 x i16> %a) {
 ; SSE2-LABEL: test2:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psllw $1, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test2:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllw $1, %xmm1
+; SSE41-NEXT: paddw %xmm0, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: test2:
 ; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $1, %xmm0, %xmm1
+; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm1
 ; AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX-NEXT: retq
 %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
@@ -56,7 +56,7 @@ define <4 x i32> @test3(<4 x i32> %a) {
 ; SSE2-LABEL: test3:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $1, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: pslld $2, %xmm0
 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; SSE2-NEXT: retq
@@ -65,7 +65,7 @@ define <4 x i32> @test3(<4 x i32> %a) {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
 ; SSE41-NEXT: pslld $2, %xmm1
-; SSE41-NEXT: pslld $1, %xmm0
+; SSE41-NEXT: paddd %xmm0, %xmm0
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT: retq
 ;
@@ -81,14 +81,14 @@ define <4 x i32> @test4(<4 x i32> %a) {
 ; SSE2-LABEL: test4:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $1, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: test4:
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pslld $1, %xmm1
+; SSE41-NEXT: paddd %xmm0, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT: retq
 ;

llvm/test/CodeGen/X86/widen_arith-4.ll

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind {
 ; SSE41-NEXT: psubw %xmm0, %xmm1
 ; SSE41-NEXT: movdqa %xmm1, %xmm2
 ; SSE41-NEXT: psllw $2, %xmm2
-; SSE41-NEXT: psllw $1, %xmm1
+; SSE41-NEXT: paddw %xmm1, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
 ; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax)
 ; SSE41-NEXT: movq %xmm2, (%rcx,%rax)
