Skip to content

Commit 48d26c2

Browse files
committed
[X86] Enable TuningSlowDivide64 on Barcelona/Bobcat/Bulldozer/Ryzen Families
Despite most AMD cpus having a lower latency for i64 divisions that converge early, we are still better off testing for values representable as i32 and performing an i32 division if possible. All AMD cpus appear to have been missed when we added the "idivq-to-divl" attribute - now matches most Intel cpu behaviour (and the x86-64/v2/3/4 levels). Unfortunately the difference in code scheduling means I've had to stop using the update_llc_test_checks script and just use an old-fashioned CHECK-DAG check for divl/divq pairs. Fixes #90985
1 parent 3809e20 commit 48d26c2

File tree

2 files changed

+28
-116
lines changed

2 files changed

+28
-116
lines changed

llvm/lib/Target/X86/X86.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,6 +1341,7 @@ def ProcessorFeatures {
13411341
FeatureCMOV,
13421342
FeatureX86_64];
13431343
list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
1344+
TuningSlowDivide64,
13441345
TuningSlowSHLD,
13451346
TuningSBBDepBreaking,
13461347
TuningInsertVZEROUPPER];
@@ -1363,6 +1364,7 @@ def ProcessorFeatures {
13631364
list<SubtargetFeature> BtVer1Tuning = [TuningFast15ByteNOP,
13641365
TuningFastScalarShiftMasks,
13651366
TuningFastVectorShiftMasks,
1367+
TuningSlowDivide64,
13661368
TuningSlowSHLD,
13671369
TuningSBBDepBreaking,
13681370
TuningInsertVZEROUPPER];
@@ -1385,6 +1387,7 @@ def ProcessorFeatures {
13851387
TuningFastVectorShiftMasks,
13861388
TuningFastMOVBE,
13871389
TuningSBBDepBreaking,
1390+
TuningSlowDivide64,
13881391
TuningSlowSHLD];
13891392
list<SubtargetFeature> BtVer2Features =
13901393
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1409,6 +1412,7 @@ def ProcessorFeatures {
14091412
FeatureLWP,
14101413
FeatureLAHFSAHF64];
14111414
list<SubtargetFeature> BdVer1Tuning = [TuningSlowSHLD,
1415+
TuningSlowDivide64,
14121416
TuningFast11ByteNOP,
14131417
TuningFastScalarShiftMasks,
14141418
TuningBranchFusion,
@@ -1488,6 +1492,7 @@ def ProcessorFeatures {
14881492
TuningFastScalarShiftMasks,
14891493
TuningFastVariablePerLaneShuffle,
14901494
TuningFastMOVBE,
1495+
TuningSlowDivide64,
14911496
TuningSlowSHLD,
14921497
TuningSBBDepBreaking,
14931498
TuningInsertVZEROUPPER,

llvm/test/CodeGen/X86/bypass-slow-division-64.ll

Lines changed: 23 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
21
; Check that 64-bit division is bypassed correctly.
32
; RUN: llc < %s -mtriple=x86_64-- -mattr=-idivq-to-divl | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
43
; RUN: llc < %s -mtriple=x86_64-- -mattr=+idivq-to-divl | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
@@ -13,17 +12,17 @@
1312
; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
1413
; RUN: llc < %s -mtriple=x86_64-- -mcpu=alderlake | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
1514
; AMD
16-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=barcelona | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
17-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
18-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
19-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
20-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
21-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
22-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
23-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
24-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
25-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
26-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,FAST-DIVQ
15+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=barcelona | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
16+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
17+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
18+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
19+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
20+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
21+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
22+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
23+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
24+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
25+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ
2726

2827
; Additional tests for 64-bit divide bypass
2928

@@ -40,22 +39,8 @@ define i64 @sdiv_quotient(i64 %a, i64 %b) nounwind {
4039
; FAST-DIVQ-NEXT: retq
4140
;
4241
; SLOW-DIVQ-LABEL: sdiv_quotient:
43-
; SLOW-DIVQ: # %bb.0:
44-
; SLOW-DIVQ-NEXT: movq %rdi, %rax
45-
; SLOW-DIVQ-NEXT: movq %rdi, %rcx
46-
; SLOW-DIVQ-NEXT: orq %rsi, %rcx
47-
; SLOW-DIVQ-NEXT: shrq $32, %rcx
48-
; SLOW-DIVQ-NEXT: je .LBB0_1
49-
; SLOW-DIVQ-NEXT: # %bb.2:
50-
; SLOW-DIVQ-NEXT: cqto
51-
; SLOW-DIVQ-NEXT: idivq %rsi
52-
; SLOW-DIVQ-NEXT: retq
53-
; SLOW-DIVQ-NEXT: .LBB0_1:
54-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
55-
; SLOW-DIVQ-NEXT: xorl %edx, %edx
56-
; SLOW-DIVQ-NEXT: divl %esi
57-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
58-
; SLOW-DIVQ-NEXT: retq
42+
; SLOW-DIVQ-DAG: idivq %rsi
43+
; SLOW-DIVQ-DAG: divl %esi
5944
%result = sdiv i64 %a, %b
6045
ret i64 %result
6146
}
@@ -92,23 +77,8 @@ define i64 @sdiv_remainder(i64 %a, i64 %b) nounwind {
9277
; FAST-DIVQ-NEXT: retq
9378
;
9479
; SLOW-DIVQ-LABEL: sdiv_remainder:
95-
; SLOW-DIVQ: # %bb.0:
96-
; SLOW-DIVQ-NEXT: movq %rdi, %rax
97-
; SLOW-DIVQ-NEXT: movq %rdi, %rcx
98-
; SLOW-DIVQ-NEXT: orq %rsi, %rcx
99-
; SLOW-DIVQ-NEXT: shrq $32, %rcx
100-
; SLOW-DIVQ-NEXT: je .LBB3_1
101-
; SLOW-DIVQ-NEXT: # %bb.2:
102-
; SLOW-DIVQ-NEXT: cqto
103-
; SLOW-DIVQ-NEXT: idivq %rsi
104-
; SLOW-DIVQ-NEXT: movq %rdx, %rax
105-
; SLOW-DIVQ-NEXT: retq
106-
; SLOW-DIVQ-NEXT: .LBB3_1:
107-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
108-
; SLOW-DIVQ-NEXT: xorl %edx, %edx
109-
; SLOW-DIVQ-NEXT: divl %esi
110-
; SLOW-DIVQ-NEXT: movl %edx, %eax
111-
; SLOW-DIVQ-NEXT: retq
80+
; SLOW-DIVQ-DAG: idivq %rsi
81+
; SLOW-DIVQ-DAG: divl %esi
11282
%result = srem i64 %a, %b
11383
ret i64 %result
11484
}
@@ -147,25 +117,8 @@ define i64 @sdiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
147117
; FAST-DIVQ-NEXT: retq
148118
;
149119
; SLOW-DIVQ-LABEL: sdiv_quotient_and_remainder:
150-
; SLOW-DIVQ: # %bb.0:
151-
; SLOW-DIVQ-NEXT: movq %rdi, %rax
152-
; SLOW-DIVQ-NEXT: movq %rdi, %rcx
153-
; SLOW-DIVQ-NEXT: orq %rsi, %rcx
154-
; SLOW-DIVQ-NEXT: shrq $32, %rcx
155-
; SLOW-DIVQ-NEXT: je .LBB6_1
156-
; SLOW-DIVQ-NEXT: # %bb.2:
157-
; SLOW-DIVQ-NEXT: cqto
158-
; SLOW-DIVQ-NEXT: idivq %rsi
159-
; SLOW-DIVQ-NEXT: addq %rdx, %rax
160-
; SLOW-DIVQ-NEXT: retq
161-
; SLOW-DIVQ-NEXT: .LBB6_1:
162-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
163-
; SLOW-DIVQ-NEXT: xorl %edx, %edx
164-
; SLOW-DIVQ-NEXT: divl %esi
165-
; SLOW-DIVQ-NEXT: # kill: def $edx killed $edx def $rdx
166-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
167-
; SLOW-DIVQ-NEXT: addq %rdx, %rax
168-
; SLOW-DIVQ-NEXT: retq
120+
; SLOW-DIVQ-DAG: idivq %rsi
121+
; SLOW-DIVQ-DAG: divl %esi
169122
%resultdiv = sdiv i64 %a, %b
170123
%resultrem = srem i64 %a, %b
171124
%result = add i64 %resultdiv, %resultrem
@@ -213,22 +166,8 @@ define i64 @udiv_quotient(i64 %a, i64 %b) nounwind {
213166
; FAST-DIVQ-NEXT: retq
214167
;
215168
; SLOW-DIVQ-LABEL: udiv_quotient:
216-
; SLOW-DIVQ: # %bb.0:
217-
; SLOW-DIVQ-NEXT: movq %rdi, %rax
218-
; SLOW-DIVQ-NEXT: movq %rdi, %rcx
219-
; SLOW-DIVQ-NEXT: orq %rsi, %rcx
220-
; SLOW-DIVQ-NEXT: shrq $32, %rcx
221-
; SLOW-DIVQ-NEXT: je .LBB9_1
222-
; SLOW-DIVQ-NEXT: # %bb.2:
223-
; SLOW-DIVQ-NEXT: xorl %edx, %edx
224-
; SLOW-DIVQ-NEXT: divq %rsi
225-
; SLOW-DIVQ-NEXT: retq
226-
; SLOW-DIVQ-NEXT: .LBB9_1:
227-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
228-
; SLOW-DIVQ-NEXT: xorl %edx, %edx
229-
; SLOW-DIVQ-NEXT: divl %esi
230-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
231-
; SLOW-DIVQ-NEXT: retq
169+
; SLOW-DIVQ-DAG: divq %rsi
170+
; SLOW-DIVQ-DAG: divl %esi
232171
%result = udiv i64 %a, %b
233172
ret i64 %result
234173
}
@@ -265,23 +204,8 @@ define i64 @udiv_remainder(i64 %a, i64 %b) nounwind {
265204
; FAST-DIVQ-NEXT: retq
266205
;
267206
; SLOW-DIVQ-LABEL: udiv_remainder:
268-
; SLOW-DIVQ: # %bb.0:
269-
; SLOW-DIVQ-NEXT: movq %rdi, %rax
270-
; SLOW-DIVQ-NEXT: movq %rdi, %rcx
271-
; SLOW-DIVQ-NEXT: orq %rsi, %rcx
272-
; SLOW-DIVQ-NEXT: shrq $32, %rcx
273-
; SLOW-DIVQ-NEXT: je .LBB12_1
274-
; SLOW-DIVQ-NEXT: # %bb.2:
275-
; SLOW-DIVQ-NEXT: xorl %edx, %edx
276-
; SLOW-DIVQ-NEXT: divq %rsi
277-
; SLOW-DIVQ-NEXT: movq %rdx, %rax
278-
; SLOW-DIVQ-NEXT: retq
279-
; SLOW-DIVQ-NEXT: .LBB12_1:
280-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
281-
; SLOW-DIVQ-NEXT: xorl %edx, %edx
282-
; SLOW-DIVQ-NEXT: divl %esi
283-
; SLOW-DIVQ-NEXT: movl %edx, %eax
284-
; SLOW-DIVQ-NEXT: retq
207+
; SLOW-DIVQ-DAG: divq %rsi
208+
; SLOW-DIVQ-DAG: divl %esi
285209
%result = urem i64 %a, %b
286210
ret i64 %result
287211
}
@@ -320,25 +244,8 @@ define i64 @udiv_quotient_and_remainder(i64 %a, i64 %b) nounwind {
320244
; FAST-DIVQ-NEXT: retq
321245
;
322246
; SLOW-DIVQ-LABEL: udiv_quotient_and_remainder:
323-
; SLOW-DIVQ: # %bb.0:
324-
; SLOW-DIVQ-NEXT: movq %rdi, %rax
325-
; SLOW-DIVQ-NEXT: movq %rdi, %rcx
326-
; SLOW-DIVQ-NEXT: orq %rsi, %rcx
327-
; SLOW-DIVQ-NEXT: shrq $32, %rcx
328-
; SLOW-DIVQ-NEXT: je .LBB15_1
329-
; SLOW-DIVQ-NEXT: # %bb.2:
330-
; SLOW-DIVQ-NEXT: xorl %edx, %edx
331-
; SLOW-DIVQ-NEXT: divq %rsi
332-
; SLOW-DIVQ-NEXT: addq %rdx, %rax
333-
; SLOW-DIVQ-NEXT: retq
334-
; SLOW-DIVQ-NEXT: .LBB15_1:
335-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax killed $rax
336-
; SLOW-DIVQ-NEXT: xorl %edx, %edx
337-
; SLOW-DIVQ-NEXT: divl %esi
338-
; SLOW-DIVQ-NEXT: # kill: def $edx killed $edx def $rdx
339-
; SLOW-DIVQ-NEXT: # kill: def $eax killed $eax def $rax
340-
; SLOW-DIVQ-NEXT: addq %rdx, %rax
341-
; SLOW-DIVQ-NEXT: retq
247+
; SLOW-DIVQ-DAG: divq %rsi
248+
; SLOW-DIVQ-DAG: divl %esi
342249
%resultdiv = udiv i64 %a, %b
343250
%resultrem = urem i64 %a, %b
344251
%result = add i64 %resultdiv, %resultrem

0 commit comments

Comments
 (0)