Skip to content

Commit 0bd9255

Browse files
authored
[X86] Improve KnownBits for X86ISD::PSADBW nodes (llvm#83830)
Don't just return the known zero upper bits, compute the absdiff KnownBits and perform the horizontal sum. Add implementations that handle both the X86ISD::PSADBW nodes and the INTRINSIC_WO_CHAIN intrinsics (pre-legalization).
1 parent c371ee9 commit 0bd9255

File tree

3 files changed

+63
-71
lines changed

3 files changed

+63
-71
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36738,6 +36738,26 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
3673836738
return TLO.CombineTo(Op, NewOp);
3673936739
}
3674036740

36741+
/// Compute known bits for a PSADBW node: each i64 result element holds the
/// sum of absolute differences of eight i8 pairs, so the value occupies at
/// most the low 11 bits and the upper bits are known zero.
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
                                      KnownBits &Known,
                                      const APInt &DemandedElts,
                                      const SelectionDAG &DAG, unsigned Depth) {
  // Map each demanded i64 result element onto its eight i8 source elements.
  unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
  APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);

  KnownBits KnownRHS = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
  KnownBits KnownLHS = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);

  // Per-byte absolute difference, widened to i16 so the horizontal sum below
  // cannot wrap (8 * 255 = 2040 fits comfortably in 16 bits).
  Known = KnownBits::absdiff(KnownRHS, KnownLHS).zext(16);

  // Horizontal reduction of the eight byte differences, performed as three
  // levels of pairwise additions:
  // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
  for (int Level = 0; Level != 3; ++Level)
    Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true,
                                        /*NUW=*/true, Known, Known);

  // The 16-bit sum is zero-extended into each 64-bit result element.
  Known = Known.zext(64);
}
36760+
3674136761
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3674236762
KnownBits &Known,
3674336763
const APInt &DemandedElts,
@@ -36887,12 +36907,13 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3688736907
break;
3688836908
}
3688936909
case X86ISD::PSADBW: {
36910+
SDValue LHS = Op.getOperand(0);
36911+
SDValue RHS = Op.getOperand(1);
3689036912
assert(VT.getScalarType() == MVT::i64 &&
36891-
Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
36913+
LHS.getValueType() == RHS.getValueType() &&
36914+
LHS.getValueType().getScalarType() == MVT::i8 &&
3689236915
"Unexpected PSADBW types");
36893-
36894-
// PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
36895-
Known.Zero.setBitsFrom(16);
36916+
computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
3689636917
break;
3689736918
}
3689836919
case X86ISD::PCMPGT:
@@ -37046,6 +37067,23 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
3704637067
}
3704737068
break;
3704837069
}
37070+
case ISD::INTRINSIC_WO_CHAIN: {
37071+
switch (Op->getConstantOperandVal(0)) {
37072+
case Intrinsic::x86_sse2_psad_bw:
37073+
case Intrinsic::x86_avx2_psad_bw:
37074+
case Intrinsic::x86_avx512_psad_bw_512: {
37075+
SDValue LHS = Op.getOperand(1);
37076+
SDValue RHS = Op.getOperand(2);
37077+
assert(VT.getScalarType() == MVT::i64 &&
37078+
LHS.getValueType() == RHS.getValueType() &&
37079+
LHS.getValueType().getScalarType() == MVT::i8 &&
37080+
"Unexpected PSADBW types");
37081+
computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37082+
break;
37083+
}
37084+
}
37085+
break;
37086+
}
3704937087
}
3705037088

3705137089
// Handle target shuffles.

llvm/test/CodeGen/X86/psadbw.ll

Lines changed: 18 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,14 @@ define i64 @combine_psadbw_demandedelt(<16 x i8> %0, <16 x i8> %1) nounwind {
5050
define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
5151
; X86-SSE-LABEL: combine_psadbw_cmp_knownbits:
5252
; X86-SSE: # %bb.0:
53-
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
54-
; X86-SSE-NEXT: pxor %xmm1, %xmm1
55-
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
56-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
57-
; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
58-
; X86-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
53+
; X86-SSE-NEXT: xorps %xmm0, %xmm0
5954
; X86-SSE-NEXT: retl
6055
;
6156
; X64-SSE-LABEL: combine_psadbw_cmp_knownbits:
6257
; X64-SSE: # %bb.0:
6358
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6459
; X64-SSE-NEXT: pxor %xmm1, %xmm1
65-
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
66-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
67-
; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
60+
; X64-SSE-NEXT: psadbw %xmm1, %xmm0
6861
; X64-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
6962
; X64-SSE-NEXT: retq
7063
;
@@ -82,96 +75,67 @@ define <2 x i64> @combine_psadbw_cmp_knownbits(<16 x i8> %a0) nounwind {
8275
ret <2 x i64> %ext
8376
}
8477

85-
; TODO: No need to scalarize the sitofp as the PSADBW results are smaller than i32.
78+
; No need to scalarize the sitofp as the PSADBW results are smaller than i32.
8679
define <2 x double> @combine_psadbw_sitofp_knownbits(<16 x i8> %a0) nounwind {
8780
; X86-SSE-LABEL: combine_psadbw_sitofp_knownbits:
8881
; X86-SSE: # %bb.0:
89-
; X86-SSE-NEXT: pushl %ebp
90-
; X86-SSE-NEXT: movl %esp, %ebp
91-
; X86-SSE-NEXT: andl $-8, %esp
92-
; X86-SSE-NEXT: subl $32, %esp
9382
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
9483
; X86-SSE-NEXT: pxor %xmm1, %xmm1
9584
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
96-
; X86-SSE-NEXT: movq %xmm1, {{[0-9]+}}(%esp)
97-
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
98-
; X86-SSE-NEXT: movq %xmm0, {{[0-9]+}}(%esp)
99-
; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
100-
; X86-SSE-NEXT: fstpl {{[0-9]+}}(%esp)
101-
; X86-SSE-NEXT: fildll {{[0-9]+}}(%esp)
102-
; X86-SSE-NEXT: fstpl (%esp)
103-
; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
104-
; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
105-
; X86-SSE-NEXT: movl %ebp, %esp
106-
; X86-SSE-NEXT: popl %ebp
85+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
86+
; X86-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
10787
; X86-SSE-NEXT: retl
10888
;
10989
; X64-SSE-LABEL: combine_psadbw_sitofp_knownbits:
11090
; X64-SSE: # %bb.0:
11191
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
11292
; X64-SSE-NEXT: pxor %xmm1, %xmm1
11393
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
114-
; X64-SSE-NEXT: movd %xmm1, %eax
115-
; X64-SSE-NEXT: xorps %xmm0, %xmm0
116-
; X64-SSE-NEXT: cvtsi2sd %eax, %xmm0
117-
; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
118-
; X64-SSE-NEXT: movd %xmm1, %eax
119-
; X64-SSE-NEXT: xorps %xmm1, %xmm1
120-
; X64-SSE-NEXT: cvtsi2sd %eax, %xmm1
121-
; X64-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
94+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
95+
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
12296
; X64-SSE-NEXT: retq
12397
;
12498
; AVX2-LABEL: combine_psadbw_sitofp_knownbits:
12599
; AVX2: # %bb.0:
126100
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
127101
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
128102
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
129-
; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm1
130-
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
131-
; AVX2-NEXT: vcvtsi2sd %eax, %xmm2, %xmm0
132-
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
103+
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
104+
; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0
133105
; AVX2-NEXT: retq
134106
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
135107
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)
136108
%cvt = sitofp <2 x i64> %sad to <2 x double>
137109
ret <2 x double> %cvt
138110
}
139111

140-
; TODO: Convert from uitofp to sitofp as the PSADBW results are zero-extended.
112+
; Convert from uitofp to sitofp as the PSADBW results are zero-extended.
141113
define <2 x double> @combine_psadbw_uitofp_knownbits(<16 x i8> %a0) nounwind {
142114
; X86-SSE-LABEL: combine_psadbw_uitofp_knownbits:
143115
; X86-SSE: # %bb.0:
144116
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
145117
; X86-SSE-NEXT: pxor %xmm1, %xmm1
146-
; X86-SSE-NEXT: psadbw %xmm1, %xmm0
147-
; X86-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
148-
; X86-SSE-NEXT: movapd {{.*#+}} xmm1 = [0,1160773632,0,1160773632]
149-
; X86-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
150-
; X86-SSE-NEXT: addpd %xmm1, %xmm0
118+
; X86-SSE-NEXT: psadbw %xmm0, %xmm1
119+
; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
120+
; X86-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
151121
; X86-SSE-NEXT: retl
152122
;
153123
; X64-SSE-LABEL: combine_psadbw_uitofp_knownbits:
154124
; X64-SSE: # %bb.0:
155125
; X64-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
156126
; X64-SSE-NEXT: pxor %xmm1, %xmm1
157-
; X64-SSE-NEXT: psadbw %xmm1, %xmm0
158-
; X64-SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
159-
; X64-SSE-NEXT: movapd {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
160-
; X64-SSE-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
161-
; X64-SSE-NEXT: addpd %xmm1, %xmm0
127+
; X64-SSE-NEXT: psadbw %xmm0, %xmm1
128+
; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
129+
; X64-SSE-NEXT: cvtdq2pd %xmm0, %xmm0
162130
; X64-SSE-NEXT: retq
163131
;
164132
; AVX2-LABEL: combine_psadbw_uitofp_knownbits:
165133
; AVX2: # %bb.0:
166134
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
167135
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
168136
; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
169-
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
170-
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
171-
; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4985484787499139072,4985484787499139072]
172-
; AVX2-NEXT: # xmm1 = mem[0,0]
173-
; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
174-
; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
137+
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
138+
; AVX2-NEXT: vcvtdq2pd %xmm0, %xmm0
175139
; AVX2-NEXT: retq
176140
%mask = and <16 x i8> %a0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
177141
%sad = tail call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %mask, <16 x i8> zeroinitializer)

llvm/test/CodeGen/X86/sad.ll

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -989,9 +989,7 @@ define dso_local i32 @sad_unroll_nonzero_initial(ptr %arg, ptr %arg1, ptr %arg2,
989989
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
990990
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
991991
; SSE2-NEXT: paddd %xmm2, %xmm0
992-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
993-
; SSE2-NEXT: paddd %xmm0, %xmm1
994-
; SSE2-NEXT: movd %xmm1, %eax
992+
; SSE2-NEXT: movd %xmm0, %eax
995993
; SSE2-NEXT: retq
996994
;
997995
; AVX-LABEL: sad_unroll_nonzero_initial:
@@ -1053,9 +1051,7 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
10531051
; SSE2-NEXT: paddd %xmm1, %xmm2
10541052
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
10551053
; SSE2-NEXT: paddd %xmm2, %xmm0
1056-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1057-
; SSE2-NEXT: por %xmm0, %xmm1
1058-
; SSE2-NEXT: movd %xmm1, %eax
1054+
; SSE2-NEXT: movd %xmm0, %eax
10591055
; SSE2-NEXT: retq
10601056
;
10611057
; AVX-LABEL: sad_double_reduction:
@@ -1067,8 +1063,6 @@ define dso_local i32 @sad_double_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %
10671063
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
10681064
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
10691065
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1070-
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1071-
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
10721066
; AVX-NEXT: vmovd %xmm0, %eax
10731067
; AVX-NEXT: retq
10741068
bb:
@@ -1115,9 +1109,7 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
11151109
; SSE2-NEXT: paddd %xmm1, %xmm2
11161110
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
11171111
; SSE2-NEXT: paddd %xmm2, %xmm0
1118-
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1119-
; SSE2-NEXT: por %xmm0, %xmm1
1120-
; SSE2-NEXT: movd %xmm1, %eax
1112+
; SSE2-NEXT: movd %xmm0, %eax
11211113
; SSE2-NEXT: retq
11221114
;
11231115
; AVX-LABEL: sad_double_reduction_abs:
@@ -1129,8 +1121,6 @@ define dso_local i32 @sad_double_reduction_abs(ptr %arg, ptr %arg1, ptr %arg2, p
11291121
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
11301122
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
11311123
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
1132-
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1133-
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
11341124
; AVX-NEXT: vmovd %xmm0, %eax
11351125
; AVX-NEXT: retq
11361126
bb:

0 commit comments

Comments
 (0)