
Commit 4924d91

[X86] computeKnownBitsForTargetNode - add INTRINSIC_WO_CHAIN handling for PSADBW intrinsics
Waiting for intrinsics to be lowered to ISD target nodes causes some poor combine decisions; at the very least, we need better value-tracking handling. As an initial step I've added support for the PSADBW intrinsics (which can be expanded along with the ISD node in #81765), as these are a good example of intrinsics that we need to handle as early as possible.
1 parent b0181be commit 4924d91
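
For context: PSADBW computes each 64-bit lane of its result as the sum of eight absolute byte differences, so a lane can never exceed 8 * 255 = 2040, meaning bits [16,64) are always zero. That is exactly the fact the new known-bits handler records. A minimal scalar model of one lane (illustrative sketch only, not code from this commit):

#include <cstdint>
#include <cstdlib>

// One 64-bit PSADBW lane: the sum of absolute differences of eight byte pairs.
// The maximum possible value is 8 * 255 = 2040 < 2^16, so the upper 48 bits
// of every lane are guaranteed zero.
uint64_t psadbw_lane(const uint8_t a[8], const uint8_t b[8]) {
  uint64_t sum = 0;
  for (int i = 0; i != 8; ++i)
    sum += (uint64_t)std::abs((int)a[i] - (int)b[i]);
  return sum;
}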

3 files changed: +31 -60 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 0 deletions
@@ -36980,6 +36980,18 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
     }
     break;
   }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (Op->getConstantOperandVal(0)) {
+    case Intrinsic::x86_sse2_psad_bw:
+    case Intrinsic::x86_avx2_psad_bw:
+    case Intrinsic::x86_avx512_psad_bw_512:
+      // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
+      assert(VT.getScalarType() == MVT::i64 && "Unexpected PSADBW types");
+      Known.Zero.setBitsFrom(16);
+      break;
+    }
+    break;
+  }
   }

   // Handle target shuffles.
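
With this handler in place, value tracking works on the intrinsic node itself, before it is lowered to an X86ISD::PSADBW target node. A sketch of what a caller now observes (hypothetical snippet, not from this commit; assumes a SelectionDAG &DAG and an SDValue Sad holding the intrinsic result are in scope):

KnownBits Known = DAG.computeKnownBits(Sad);
// Each i64 lane is known to fit in 16 bits, so e.g. a sitofp of the result
// can be narrowed to operate on the low i32 of each lane.
assert(Known.countMinLeadingZeros() >= 48 && "upper 48 bits should be zero");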

llvm/test/CodeGen/X86/psadbw.ll

Lines changed: 16 additions & 47 deletions
@@ -54,7 +54,6 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X86-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
@@ -64,7 +63,6 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; X64-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    retq
 ;
@@ -82,96 +80,67 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
   ret <2 x i64> %ext
 }
 
-; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
+; No need to scalarize the sitofp as the PSADBW results are smaller than i32.
 define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
 ; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    pushl %ebp
-; X86-SSE-NEXT:    movl %esp, %ebp
-; X86-SSE-NEXT:    andl $-8, %esp
-; X86-SSE-NEXT:    subl $32, %esp
 ; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X86-SSE-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
-; X86-SSE-NEXT:    movq %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fstpl {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fildll {{[0-9]+}}(%esp)
-; X86-SSE-NEXT:    fstpl (%esp)
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; X86-SSE-NEXT:    movl %ebp, %esp
-; X86-SSE-NEXT:    popl %ebp
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
 ; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    xorps %xmm0, %xmm0
-; X64-SSE-NEXT:    cvtsi2sd %eax, %xmm0
-; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; X64-SSE-NEXT:    movd %xmm1, %eax
-; X64-SSE-NEXT:    xorps %xmm1, %xmm1
-; X64-SSE-NEXT:    cvtsi2sd %eax, %xmm1
-; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm1
-; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX2-NEXT:    vcvtsi2sd %eax, %xmm2, %xmm0
-; AVX2-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
   %cvt = sitofp <2 x i64> %sad to <2 x double>
   ret <2 x double> %cvt
 }
 
-; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
+; Convert from uitofp to sitofp as the PSADBW results are zero-extended.
 define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
 ; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
-; X86-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X86-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE-NEXT:    movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
-; X86-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
-; X86-SSE-NEXT:    addpd %xmm1, %xmm0
+; X86-SSE-NEXT:    psadbw %xmm0, %xmm1
+; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X86-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
-; X64-SSE-NEXT:    psadbw %xmm1, %xmm0
-; X64-SSE-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE-NEXT:    movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; X64-SSE-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; X64-SSE-NEXT:    addpd %xmm1, %xmm0
+; X64-SSE-NEXT:    psadbw %xmm0, %xmm1
+; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; X64-SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT:    vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
-; AVX2-NEXT:    # xmm1 = mem[0,0]
-; AVX2-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX2-NEXT:    retq
   %mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
   %sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
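
These test updates all follow from the same bound: a PSADBW lane is at most 2040, so signed and unsigned conversion to double agree and the value survives truncation to i32, which is why the scalarized cvtsi2sd sequences and the uitofp bias-and-subtract sequences collapse to a single cvtdq2pd. A small self-contained check of that equivalence (illustrative only, under the assumption that the lane bound holds):

#include <cassert>
#include <cstdint>

double convert_sad_lane(uint64_t lane) {
  assert(lane <= 2040 && "PSADBW lane bound");
  double as_unsigned_i64 = (double)lane;         // what uitofp computes
  double as_signed_i32 = (double)(int32_t)lane;  // sitofp of the low i32
  assert(as_unsigned_i64 == as_signed_i32);      // equal while bits [16,64) are zero
  return as_signed_i32;
}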

llvm/test/CodeGen/X86/sad.ll

Lines changed: 3 additions & 13 deletions
@@ -989,9 +989,7 @@ define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2,
 ; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: sad_unroll_nonzero_initial:
@@ -1053,9 +1051,7 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: sad_double_reduction:
@@ -1067,8 +1063,6 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 bb:
@@ -1115,9 +1109,7 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
 ; SSE2-NEXT:    paddd %xmm1, %xmm2
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT:    por %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: sad_double_reduction_abs:
@@ -1129,8 +1121,6 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
 ; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
 ; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 bb:
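
The dropped instructions in these reductions were folding v4i32 element 1 into element 0; after the [2,3,2,3] shuffle-and-add, element 1 holds the sum of the two known-zero high halves of the PSADBW lanes, so the extra shuffle plus paddd/por contributed nothing. A rough model (hypothetical helper, viewing the v2i64 SAD result as v4i32):

#include <cstdint>

uint32_t reduce_sad(const uint32_t v[4]) {
  // v[1] and v[3] are the high halves of zero-extended PSADBW lanes: both zero.
  uint32_t acc = v[0] + v[2]; // the two real, 16-bit-bounded partial sums
  // Previously emitted: acc += v[1] + v[3]; removable, since both are known zero.
  return acc;
}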
