Skip to content

Commit e094abd

Browse files
authored
[SelectionDAG] Expand [US]CMP using arithmetic on boolean values instead of selects (#98774)
The previous expansion of [US]CMP was done using two selects and two compares. It produced decent code, but on many platforms it is better to implement [US]CMP nodes by performing the following operation: `[us]cmp(x, y) = (x [us]> y) - (x [us]< y)`. This patch adds this new expansion, as well as a hook in TargetLowering (`shouldExpandCmpUsingSelects()`) to allow some targets to still use the select-based approach. AArch64 and SystemZ are currently the only targets to prefer the select-based approach, but other targets may also start to use it if it provides better codegen.
1 parent bb604ae commit e094abd

File tree

20 files changed

+4391
-2853
lines changed

20 files changed

+4391
-2853
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3391,6 +3391,10 @@ class TargetLoweringBase {
33913391
return isOperationLegalOrCustom(Op, VT);
33923392
}
33933393

3394+
/// Should we expand [US]CMP nodes using two selects and two compares, or by
3395+
/// doing arithmetic on boolean types
3396+
virtual bool shouldExpandCmpUsingSelects() const { return false; }
3397+
33943398
/// Does this target support complex deinterleaving
33953399
virtual bool isComplexDeinterleavingSupported() const { return false; }
33963400

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10391,14 +10391,28 @@ SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
1039110391

1039210392
auto LTPredicate = (Opcode == ISD::UCMP ? ISD::SETULT : ISD::SETLT);
1039310393
auto GTPredicate = (Opcode == ISD::UCMP ? ISD::SETUGT : ISD::SETGT);
10394-
1039510394
SDValue IsLT = DAG.getSetCC(dl, BoolVT, LHS, RHS, LTPredicate);
1039610395
SDValue IsGT = DAG.getSetCC(dl, BoolVT, LHS, RHS, GTPredicate);
10397-
SDValue SelectZeroOrOne =
10398-
DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),
10399-
DAG.getConstant(0, dl, ResVT));
10400-
return DAG.getSelect(dl, ResVT, IsLT, DAG.getConstant(-1, dl, ResVT),
10401-
SelectZeroOrOne);
10396+
10397+
// We can't perform arithmetic on i1 values. Extending them would
10398+
// probably result in worse codegen, so let's just use two selects instead.
10399+
// Some targets are also just better off using selects rather than subtraction
10400+
// because one of the conditions can be merged with one of the selects.
10401+
// And finally, if we don't know the contents of high bits of a boolean value
10402+
// we can't perform any arithmetic either.
10403+
if (shouldExpandCmpUsingSelects() || BoolVT.getScalarSizeInBits() == 1 ||
10404+
getBooleanContents(BoolVT) == UndefinedBooleanContent) {
10405+
SDValue SelectZeroOrOne =
10406+
DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),
10407+
DAG.getConstant(0, dl, ResVT));
10408+
return DAG.getSelect(dl, ResVT, IsLT, DAG.getConstant(-1, dl, ResVT),
10409+
SelectZeroOrOne);
10410+
}
10411+
10412+
if (getBooleanContents(BoolVT) == ZeroOrNegativeOneBooleanContent)
10413+
std::swap(IsGT, IsLT);
10414+
return DAG.getSExtOrTrunc(DAG.getNode(ISD::SUB, dl, BoolVT, IsGT, IsLT), dl,
10415+
ResVT);
1040210416
}
1040310417

1040410418
SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,8 @@ class AArch64TargetLowering : public TargetLowering {
907907

908908
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
909909

910+
bool shouldExpandCmpUsingSelects() const override { return true; }
911+
910912
bool isComplexDeinterleavingSupported() const override;
911913
bool isComplexDeinterleavingOperationSupported(
912914
ComplexDeinterleavingOperation Operation, Type *Ty) const override;

llvm/lib/Target/SystemZ/SystemZISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,8 @@ class SystemZTargetLowering : public TargetLowering {
507507

508508
bool shouldConsiderGEPOffsetSplit() const override { return true; }
509509

510+
bool shouldExpandCmpUsingSelects() const override { return true; }
511+
510512
const char *getTargetNodeName(unsigned Opcode) const override;
511513
std::pair<unsigned, const TargetRegisterClass *>
512514
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,

llvm/test/CodeGen/ARM/scmp.ll

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=armv7-unknown-eabi %s -o - | FileCheck %s
3+
4+
define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
5+
; CHECK-LABEL: scmp_8_8:
6+
; CHECK: @ %bb.0:
7+
; CHECK-NEXT: cmp r0, r1
8+
; CHECK-NEXT: mov r0, #0
9+
; CHECK-NEXT: mov r2, #0
10+
; CHECK-NEXT: movwlt r0, #1
11+
; CHECK-NEXT: movwgt r2, #1
12+
; CHECK-NEXT: sub r0, r2, r0
13+
; CHECK-NEXT: bx lr
14+
%1 = call i8 @llvm.scmp(i8 %x, i8 %y)
15+
ret i8 %1
16+
}
17+
18+
define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
19+
; CHECK-LABEL: scmp_8_16:
20+
; CHECK: @ %bb.0:
21+
; CHECK-NEXT: cmp r0, r1
22+
; CHECK-NEXT: mov r0, #0
23+
; CHECK-NEXT: mov r2, #0
24+
; CHECK-NEXT: movwlt r0, #1
25+
; CHECK-NEXT: movwgt r2, #1
26+
; CHECK-NEXT: sub r0, r2, r0
27+
; CHECK-NEXT: bx lr
28+
%1 = call i8 @llvm.scmp(i16 %x, i16 %y)
29+
ret i8 %1
30+
}
31+
32+
define i8 @scmp_8_32(i32 %x, i32 %y) nounwind {
33+
; CHECK-LABEL: scmp_8_32:
34+
; CHECK: @ %bb.0:
35+
; CHECK-NEXT: cmp r0, r1
36+
; CHECK-NEXT: mov r0, #0
37+
; CHECK-NEXT: mov r2, #0
38+
; CHECK-NEXT: movwlt r0, #1
39+
; CHECK-NEXT: movwgt r2, #1
40+
; CHECK-NEXT: sub r0, r2, r0
41+
; CHECK-NEXT: bx lr
42+
%1 = call i8 @llvm.scmp(i32 %x, i32 %y)
43+
ret i8 %1
44+
}
45+
46+
define i8 @scmp_8_64(i64 %x, i64 %y) nounwind {
47+
; CHECK-LABEL: scmp_8_64:
48+
; CHECK: @ %bb.0:
49+
; CHECK-NEXT: .save {r11, lr}
50+
; CHECK-NEXT: push {r11, lr}
51+
; CHECK-NEXT: subs lr, r0, r2
52+
; CHECK-NEXT: mov r12, #0
53+
; CHECK-NEXT: sbcs lr, r1, r3
54+
; CHECK-NEXT: mov lr, #0
55+
; CHECK-NEXT: movwlt lr, #1
56+
; CHECK-NEXT: subs r0, r2, r0
57+
; CHECK-NEXT: sbcs r0, r3, r1
58+
; CHECK-NEXT: movwlt r12, #1
59+
; CHECK-NEXT: sub r0, r12, lr
60+
; CHECK-NEXT: pop {r11, pc}
61+
%1 = call i8 @llvm.scmp(i64 %x, i64 %y)
62+
ret i8 %1
63+
}
64+
65+
define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
66+
; CHECK-LABEL: scmp_8_128:
67+
; CHECK: @ %bb.0:
68+
; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
69+
; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
70+
; CHECK-NEXT: ldr r4, [sp, #24]
71+
; CHECK-NEXT: mov r5, #0
72+
; CHECK-NEXT: ldr r6, [sp, #28]
73+
; CHECK-NEXT: subs r7, r0, r4
74+
; CHECK-NEXT: ldr r12, [sp, #32]
75+
; CHECK-NEXT: sbcs r7, r1, r6
76+
; CHECK-NEXT: ldr lr, [sp, #36]
77+
; CHECK-NEXT: sbcs r7, r2, r12
78+
; CHECK-NEXT: sbcs r7, r3, lr
79+
; CHECK-NEXT: mov r7, #0
80+
; CHECK-NEXT: movwlt r7, #1
81+
; CHECK-NEXT: subs r0, r4, r0
82+
; CHECK-NEXT: sbcs r0, r6, r1
83+
; CHECK-NEXT: sbcs r0, r12, r2
84+
; CHECK-NEXT: sbcs r0, lr, r3
85+
; CHECK-NEXT: movwlt r5, #1
86+
; CHECK-NEXT: sub r0, r5, r7
87+
; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc}
88+
%1 = call i8 @llvm.scmp(i128 %x, i128 %y)
89+
ret i8 %1
90+
}
91+
92+
define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
93+
; CHECK-LABEL: scmp_32_32:
94+
; CHECK: @ %bb.0:
95+
; CHECK-NEXT: cmp r0, r1
96+
; CHECK-NEXT: mov r0, #0
97+
; CHECK-NEXT: mov r2, #0
98+
; CHECK-NEXT: movwlt r0, #1
99+
; CHECK-NEXT: movwgt r2, #1
100+
; CHECK-NEXT: sub r0, r2, r0
101+
; CHECK-NEXT: bx lr
102+
%1 = call i32 @llvm.scmp(i32 %x, i32 %y)
103+
ret i32 %1
104+
}
105+
106+
define i32 @scmp_32_64(i64 %x, i64 %y) nounwind {
107+
; CHECK-LABEL: scmp_32_64:
108+
; CHECK: @ %bb.0:
109+
; CHECK-NEXT: .save {r11, lr}
110+
; CHECK-NEXT: push {r11, lr}
111+
; CHECK-NEXT: subs lr, r0, r2
112+
; CHECK-NEXT: mov r12, #0
113+
; CHECK-NEXT: sbcs lr, r1, r3
114+
; CHECK-NEXT: mov lr, #0
115+
; CHECK-NEXT: movwlt lr, #1
116+
; CHECK-NEXT: subs r0, r2, r0
117+
; CHECK-NEXT: sbcs r0, r3, r1
118+
; CHECK-NEXT: movwlt r12, #1
119+
; CHECK-NEXT: sub r0, r12, lr
120+
; CHECK-NEXT: pop {r11, pc}
121+
%1 = call i32 @llvm.scmp(i64 %x, i64 %y)
122+
ret i32 %1
123+
}
124+
125+
define i64 @scmp_64_64(i64 %x, i64 %y) nounwind {
126+
; CHECK-LABEL: scmp_64_64:
127+
; CHECK: @ %bb.0:
128+
; CHECK-NEXT: .save {r11, lr}
129+
; CHECK-NEXT: push {r11, lr}
130+
; CHECK-NEXT: subs lr, r0, r2
131+
; CHECK-NEXT: mov r12, #0
132+
; CHECK-NEXT: sbcs lr, r1, r3
133+
; CHECK-NEXT: mov lr, #0
134+
; CHECK-NEXT: movwlt lr, #1
135+
; CHECK-NEXT: subs r0, r2, r0
136+
; CHECK-NEXT: sbcs r0, r3, r1
137+
; CHECK-NEXT: movwlt r12, #1
138+
; CHECK-NEXT: sub r0, r12, lr
139+
; CHECK-NEXT: asr r1, r0, #31
140+
; CHECK-NEXT: pop {r11, pc}
141+
%1 = call i64 @llvm.scmp(i64 %x, i64 %y)
142+
ret i64 %1
143+
}

llvm/test/CodeGen/ARM/ucmp.ll

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=armv7-unknown-eabi %s -o - | FileCheck %s
3+
4+
define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
5+
; CHECK-LABEL: ucmp_8_8:
6+
; CHECK: @ %bb.0:
7+
; CHECK-NEXT: cmp r0, r1
8+
; CHECK-NEXT: mov r0, #0
9+
; CHECK-NEXT: mov r2, #0
10+
; CHECK-NEXT: movwlo r0, #1
11+
; CHECK-NEXT: movwhi r2, #1
12+
; CHECK-NEXT: sub r0, r2, r0
13+
; CHECK-NEXT: bx lr
14+
%1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
15+
ret i8 %1
16+
}
17+
18+
define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
19+
; CHECK-LABEL: ucmp_8_16:
20+
; CHECK: @ %bb.0:
21+
; CHECK-NEXT: cmp r0, r1
22+
; CHECK-NEXT: mov r0, #0
23+
; CHECK-NEXT: mov r2, #0
24+
; CHECK-NEXT: movwlo r0, #1
25+
; CHECK-NEXT: movwhi r2, #1
26+
; CHECK-NEXT: sub r0, r2, r0
27+
; CHECK-NEXT: bx lr
28+
%1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
29+
ret i8 %1
30+
}
31+
32+
define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
33+
; CHECK-LABEL: ucmp_8_32:
34+
; CHECK: @ %bb.0:
35+
; CHECK-NEXT: cmp r0, r1
36+
; CHECK-NEXT: mov r0, #0
37+
; CHECK-NEXT: mov r2, #0
38+
; CHECK-NEXT: movwlo r0, #1
39+
; CHECK-NEXT: movwhi r2, #1
40+
; CHECK-NEXT: sub r0, r2, r0
41+
; CHECK-NEXT: bx lr
42+
%1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
43+
ret i8 %1
44+
}
45+
46+
define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind {
47+
; CHECK-LABEL: ucmp_8_64:
48+
; CHECK: @ %bb.0:
49+
; CHECK-NEXT: .save {r11, lr}
50+
; CHECK-NEXT: push {r11, lr}
51+
; CHECK-NEXT: subs lr, r0, r2
52+
; CHECK-NEXT: mov r12, #0
53+
; CHECK-NEXT: sbcs lr, r1, r3
54+
; CHECK-NEXT: mov lr, #0
55+
; CHECK-NEXT: movwlo lr, #1
56+
; CHECK-NEXT: subs r0, r2, r0
57+
; CHECK-NEXT: sbcs r0, r3, r1
58+
; CHECK-NEXT: movwlo r12, #1
59+
; CHECK-NEXT: sub r0, r12, lr
60+
; CHECK-NEXT: pop {r11, pc}
61+
%1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
62+
ret i8 %1
63+
}
64+
65+
define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
66+
; CHECK-LABEL: ucmp_8_128:
67+
; CHECK: @ %bb.0:
68+
; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
69+
; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
70+
; CHECK-NEXT: ldr r4, [sp, #24]
71+
; CHECK-NEXT: mov r5, #0
72+
; CHECK-NEXT: ldr r6, [sp, #28]
73+
; CHECK-NEXT: subs r7, r0, r4
74+
; CHECK-NEXT: ldr r12, [sp, #32]
75+
; CHECK-NEXT: sbcs r7, r1, r6
76+
; CHECK-NEXT: ldr lr, [sp, #36]
77+
; CHECK-NEXT: sbcs r7, r2, r12
78+
; CHECK-NEXT: sbcs r7, r3, lr
79+
; CHECK-NEXT: mov r7, #0
80+
; CHECK-NEXT: movwlo r7, #1
81+
; CHECK-NEXT: subs r0, r4, r0
82+
; CHECK-NEXT: sbcs r0, r6, r1
83+
; CHECK-NEXT: sbcs r0, r12, r2
84+
; CHECK-NEXT: sbcs r0, lr, r3
85+
; CHECK-NEXT: movwlo r5, #1
86+
; CHECK-NEXT: sub r0, r5, r7
87+
; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc}
88+
%1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
89+
ret i8 %1
90+
}
91+
92+
define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
93+
; CHECK-LABEL: ucmp_32_32:
94+
; CHECK: @ %bb.0:
95+
; CHECK-NEXT: cmp r0, r1
96+
; CHECK-NEXT: mov r0, #0
97+
; CHECK-NEXT: mov r2, #0
98+
; CHECK-NEXT: movwlo r0, #1
99+
; CHECK-NEXT: movwhi r2, #1
100+
; CHECK-NEXT: sub r0, r2, r0
101+
; CHECK-NEXT: bx lr
102+
%1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
103+
ret i32 %1
104+
}
105+
106+
define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind {
107+
; CHECK-LABEL: ucmp_32_64:
108+
; CHECK: @ %bb.0:
109+
; CHECK-NEXT: .save {r11, lr}
110+
; CHECK-NEXT: push {r11, lr}
111+
; CHECK-NEXT: subs lr, r0, r2
112+
; CHECK-NEXT: mov r12, #0
113+
; CHECK-NEXT: sbcs lr, r1, r3
114+
; CHECK-NEXT: mov lr, #0
115+
; CHECK-NEXT: movwlo lr, #1
116+
; CHECK-NEXT: subs r0, r2, r0
117+
; CHECK-NEXT: sbcs r0, r3, r1
118+
; CHECK-NEXT: movwlo r12, #1
119+
; CHECK-NEXT: sub r0, r12, lr
120+
; CHECK-NEXT: pop {r11, pc}
121+
%1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
122+
ret i32 %1
123+
}
124+
125+
define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind {
126+
; CHECK-LABEL: ucmp_64_64:
127+
; CHECK: @ %bb.0:
128+
; CHECK-NEXT: .save {r11, lr}
129+
; CHECK-NEXT: push {r11, lr}
130+
; CHECK-NEXT: subs lr, r0, r2
131+
; CHECK-NEXT: mov r12, #0
132+
; CHECK-NEXT: sbcs lr, r1, r3
133+
; CHECK-NEXT: mov lr, #0
134+
; CHECK-NEXT: movwlo lr, #1
135+
; CHECK-NEXT: subs r0, r2, r0
136+
; CHECK-NEXT: sbcs r0, r3, r1
137+
; CHECK-NEXT: movwlo r12, #1
138+
; CHECK-NEXT: sub r0, r12, lr
139+
; CHECK-NEXT: asr r1, r0, #31
140+
; CHECK-NEXT: pop {r11, pc}
141+
%1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
142+
ret i64 %1
143+
}

0 commit comments

Comments
 (0)