
Commit 6aab826

rez5427, Yui5427, arsenm, and RKSimon authored
[DAGCombiner] add fold (xor (smin(x, C), C)) and fold (xor (smax(x, C), C)) (#155141)
Hi, I compared GCC's and Clang's output for the equivalent of the following LLVM IR, and there is a small difference between the two. The LLVM IR is:

```llvm
define i64 @test_smin_neg_one(i64 %a) {
  %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 -1)
  %retval.0 = xor i64 %1, -1
  ret i64 %retval.0
}
```

GCC generates:

```
cmp x0, 0
csinv x0, xzr, x0, ge
ret
```

Clang generates:

```
cmn x0, #1
csinv x8, x0, xzr, lt
mvn x0, x8
ret
```

Clang needlessly routes the value through x8 and then flips it back into x0. So I added the following folds to DAGCombiner:

- fold (xor (smax(x, C), C)) -> select (x > C), xor(x, C), 0
- fold (xor (smin(x, C), C)) -> select (x < C), xor(x, C), 0

alive2: https://alive2.llvm.org/ce/z/gffoir

---

Co-authored-by: Yui5427 <[email protected]>
Co-authored-by: Matt Arsenault <[email protected]>
Co-authored-by: Simon Pilgrim <[email protected]>
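For context, one way this kind of IR can arise from source code (a hypothetical reduction, not taken from the PR): clamp a value with `std::min`, then bitwise-negate the result. At -O2, Clang typically canonicalizes the `std::min` comparison to `@llvm.smin.i64` and the `~` to an xor with -1, producing IR shaped like the example above.

```cpp
// Hypothetical C++ source that lowers to IR like the example above:
// std::min(a, -1) becomes @llvm.smin.i64, operator~ becomes xor with -1.
#include <algorithm>
#include <cstdint>

int64_t test_smin_neg_one(int64_t a) {
  return ~std::min(a, int64_t{-1});
}
```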
1 parent d2fbca8 commit 6aab826

File tree

2 files changed: +245 -0 lines changed


llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 49 additions & 0 deletions
```diff
@@ -10092,6 +10092,55 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
     return Combined;
 
+  // fold (xor (smin(x, C), C)) -> select (x < C), xor(x, C), 0
+  // fold (xor (smax(x, C), C)) -> select (x > C), xor(x, C), 0
+  // fold (xor (umin(x, C), C)) -> select (x < C), xor(x, C), 0
+  // fold (xor (umax(x, C), C)) -> select (x > C), xor(x, C), 0
+  SDValue Op0;
+  if (sd_match(N0, m_OneUse(m_AnyOf(m_SMin(m_Value(Op0), m_Specific(N1)),
+                                    m_SMax(m_Value(Op0), m_Specific(N1)),
+                                    m_UMin(m_Value(Op0), m_Specific(N1)),
+                                    m_UMax(m_Value(Op0), m_Specific(N1)))))) {
+
+    if (isa<ConstantSDNode>(N1) ||
+        ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
+      // For vectors, only optimize when the constant is zero or all-ones to
+      // avoid generating more instructions
+      if (VT.isVector()) {
+        ConstantSDNode *N1C = isConstOrConstSplat(N1);
+        if (!N1C || (!N1C->isZero() && !N1C->isAllOnes()))
+          return SDValue();
+      }
+
+      // Avoid the fold if the minmax operation is legal and select is expensive
+      if (TLI.isOperationLegal(N0.getOpcode(), VT) &&
+          TLI.isPredictableSelectExpensive())
+        return SDValue();
+
+      EVT CCVT = getSetCCResultType(VT);
+      ISD::CondCode CC;
+      switch (N0.getOpcode()) {
+      case ISD::SMIN:
+        CC = ISD::SETLT;
+        break;
+      case ISD::SMAX:
+        CC = ISD::SETGT;
+        break;
+      case ISD::UMIN:
+        CC = ISD::SETULT;
+        break;
+      case ISD::UMAX:
+        CC = ISD::SETUGT;
+        break;
+      }
+      SDValue FN1 = DAG.getFreeze(N1);
+      SDValue Cmp = DAG.getSetCC(DL, CCVT, Op0, FN1, CC);
+      SDValue XorXC = DAG.getNode(ISD::XOR, DL, VT, Op0, FN1);
+      SDValue Zero = DAG.getConstant(0, DL, VT);
+      return DAG.getSelect(DL, VT, Cmp, XorXC, Zero);
+    }
+  }
+
   return SDValue();
 }
```
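The folds themselves are proven on the linked Alive2 page. As an independent sanity check, here is a minimal standalone sketch (not part of the commit) that exhaustively verifies the smin identity over every i8 value/constant pair; the smax/umin/umax variants check the same way with `std::max` and unsigned types:

```cpp
// Standalone sanity check, not part of the commit: verify that
// (smin(x, C) ^ C) == (x < C ? (x ^ C) : 0) for all i8 x and C.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  for (int c = -128; c <= 127; ++c) {
    for (int x = -128; x <= 127; ++x) {
      int8_t X = static_cast<int8_t>(x);
      int8_t C = static_cast<int8_t>(c);
      // Left side: the original pattern, min then xor by the same constant.
      int8_t Folded = static_cast<int8_t>(std::min(X, C) ^ C);
      // Right side: the select form the combine produces.
      int8_t Select = (X < C) ? static_cast<int8_t>(X ^ C)
                              : static_cast<int8_t>(0);
      if (Folded != Select) {
        std::printf("mismatch: x=%d c=%d\n", x, c);
        return 1;
      }
    }
  }
  std::puts("smin fold holds for all i8 pairs");
  return 0;
}
```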

Lines changed: 196 additions & 0 deletions

@@ -0,0 +1,196 @@

```llvm
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s

; Test for DAGCombiner optimization: fold (xor (smin(x, C), C)) -> select (x < C), xor (x, C), 0

define i64 @test_smin_neg_one(i64 %a) {
; CHECK-LABEL: test_smin_neg_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmn x0, #1
; CHECK-NEXT:    csinv x0, xzr, x0, ge
; CHECK-NEXT:    ret
  %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 -1)
  %retval.0 = xor i64 %1, -1
  ret i64 %retval.0
}

define i64 @test_smin_constant(i64 %a) {
; CHECK-LABEL: test_smin_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    eor x8, x0, #0x8
; CHECK-NEXT:    cmp x0, #8
; CHECK-NEXT:    csel x0, x8, xzr, lt
; CHECK-NEXT:    ret
  %1 = tail call i64 @llvm.smin.i64(i64 %a, i64 8)
  %retval.0 = xor i64 %1, 8
  ret i64 %retval.0
}

; Test for DAGCombiner optimization: fold (xor (smax(x, C), C)) -> select (x > C), xor (x, C), 0
define i64 @test_smax_neg_one(i64 %a) {
; CHECK-LABEL: test_smax_neg_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvn x8, x0
; CHECK-NEXT:    bic x0, x8, x0, asr #63
; CHECK-NEXT:    ret
  %1 = tail call i64 @llvm.smax.i64(i64 %a, i64 -1)
  %retval.0 = xor i64 %1, -1
  ret i64 %retval.0
}

define i64 @test_smax_constant(i64 %a) {
; CHECK-LABEL: test_smax_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    eor x8, x0, #0x8
; CHECK-NEXT:    cmp x0, #8
; CHECK-NEXT:    csel x0, x8, xzr, gt
; CHECK-NEXT:    ret
  %1 = tail call i64 @llvm.smax.i64(i64 %a, i64 8)
  %retval.0 = xor i64 %1, 8
  ret i64 %retval.0
}

define i64 @test_umin_neg_one(i64 %a) {
; CHECK-LABEL: test_umin_neg_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvn x0, x0
; CHECK-NEXT:    ret
  %1 = tail call i64 @llvm.umin.i64(i64 %a, i64 -1)
  %retval.0 = xor i64 %1, -1
  ret i64 %retval.0
}

define i64 @test_umin_constant(i64 %a) {
; CHECK-LABEL: test_umin_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    eor x8, x0, #0x8
; CHECK-NEXT:    cmp x0, #8
; CHECK-NEXT:    csel x0, x8, xzr, lo
; CHECK-NEXT:    ret
  %1 = tail call i64 @llvm.umin.i64(i64 %a, i64 8)
  %retval.0 = xor i64 %1, 8
  ret i64 %retval.0
}

define i64 @test_umax_neg_one(i64 %a) {
; CHECK-LABEL: test_umax_neg_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov x0, xzr
; CHECK-NEXT:    ret
  %1 = tail call i64 @llvm.umax.i64(i64 %a, i64 -1)
  %retval.0 = xor i64 %1, -1
  ret i64 %retval.0
}

define i64 @test_umax_constant(i64 %a) {
; CHECK-LABEL: test_umax_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    eor x8, x0, #0x8
; CHECK-NEXT:    cmp x0, #8
; CHECK-NEXT:    csel x0, x8, xzr, hi
; CHECK-NEXT:    ret
  %1 = tail call i64 @llvm.umax.i64(i64 %a, i64 8)
  %retval.0 = xor i64 %1, 8
  ret i64 %retval.0
}

; Test vector cases
define <4 x i32> @test_smin_vector_neg_one(<4 x i32> %a) {
; CHECK-LABEL: test_smin_vector_neg_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.2d, #0xffffffffffffffff
; CHECK-NEXT:    cmgt v1.4s, v1.4s, v0.4s
; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
; CHECK-NEXT:    ret
  %1 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
  %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %retval.0
}

define <4 x i32> @test_smin_vector_constant(<4 x i32> %a) {
; CHECK-LABEL: test_smin_vector_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.4s, #8
; CHECK-NEXT:    smin v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %1 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
  %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
  ret <4 x i32> %retval.0
}

define <4 x i32> @test_smax_vector_neg_one(<4 x i32> %a) {
; CHECK-LABEL: test_smax_vector_neg_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    cmge v1.4s, v0.4s, #0
; CHECK-NEXT:    bic v0.16b, v1.16b, v0.16b
; CHECK-NEXT:    ret
  %1 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
  %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %retval.0
}

define <4 x i32> @test_smax_vector_constant(<4 x i32> %a) {
; CHECK-LABEL: test_smax_vector_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.4s, #8
; CHECK-NEXT:    smax v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %1 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
  %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
  ret <4 x i32> %retval.0
}

define <4 x i32> @test_umin_vector_neg_one(<4 x i32> %a) {
; CHECK-LABEL: test_umin_vector_neg_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mvn v0.16b, v0.16b
; CHECK-NEXT:    ret
  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
  %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %retval.0
}

define <4 x i32> @test_umin_vector_constant(<4 x i32> %a) {
; CHECK-LABEL: test_umin_vector_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.4s, #8
; CHECK-NEXT:    umin v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
  %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
  ret <4 x i32> %retval.0
}

define <4 x i32> @test_umax_vector_neg_one(<4 x i32> %a) {
; CHECK-LABEL: test_umax_vector_neg_one:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ret
  %1 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
  %retval.0 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %retval.0
}

define <4 x i32> @test_umax_vector_constant(<4 x i32> %a) {
; CHECK-LABEL: test_umax_vector_constant:
; CHECK:       // %bb.0:
; CHECK-NEXT:    movi v1.4s, #8
; CHECK-NEXT:    umax v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    eor v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %1 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> <i32 8, i32 8, i32 8, i32 8>)
  %retval.0 = xor <4 x i32> %1, <i32 8, i32 8, i32 8, i32 8>
  ret <4 x i32> %retval.0
}

declare i64 @llvm.smin.i64(i64, i64)
declare i64 @llvm.smax.i64(i64, i64)
declare i64 @llvm.umin.i64(i64, i64)
declare i64 @llvm.umax.i64(i64, i64)
declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
```
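Two of the tests above are fully constant-foldable rather than exercising the new select form: umin(x, -1) is always x, so the combined expression reduces to a single mvn, and umax(x, -1) is always -1, so the whole expression folds to zero. A small standalone check of those identities at i16 width (again, not from the commit):

```cpp
// Standalone check, not part of the commit: the all-ones edge cases pinned
// down by test_umin_neg_one and test_umax_neg_one.
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const uint16_t AllOnes = 0xFFFF;
  for (uint32_t i = 0; i <= 0xFFFF; ++i) {
    uint16_t X = static_cast<uint16_t>(i);
    // umin(x, all-ones) is x, so xor with all-ones is just ~x.
    assert(static_cast<uint16_t>(std::min(X, AllOnes) ^ AllOnes) ==
           static_cast<uint16_t>(~X));
    // umax(x, all-ones) is all-ones, so the xor collapses to 0.
    assert(static_cast<uint16_t>(std::max(X, AllOnes) ^ AllOnes) == 0);
  }
  return 0;
}
```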
