
Commit e88fbe6

[X86] Use AlwaysInline to determine whether to emit code or bail when inlining is determined to be unprofitable
Assume it is true by the time we reach the getMemset code, as inlining ought to be profitable by then, just like how getMemcpy already works.
1 parent e84a757

2 files changed: +66 −142 lines changed
llvm/lib/Target/X86/X86SelectionDAGInfo.cpp (+62 −11)
@@ -63,14 +63,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   if (DstPtrInfo.getAddrSpace() >= 256)
     return SDValue();
 
+  if (!ConstantSize)
+    return SDValue();
+
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
-    return SDValue();
+  //
+  // If we MUST inline, then don't care about the given alignment: on x86
+  // the CPU doesn't care either, and as long as the count is a multiple of
+  // a type's width we can use fewer, wider instructions.
+  //
+  // Unaligned accesses are not slower than reading the same value byte by
+  // byte; they are slower than aligned accesses, but we don't have that
+  // option here. The real slowdown comes from a value straddling cache
+  // lines, which forces the CPU to read two cache lines per access
+  // (likewise for writes), but that's unavoidable: we have to access the
+  // range one way or another, so we pay the penalty either way.
+  //
+  // So instead we derive the alignment from what the constant size
+  // guarantees.
 
   uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (AlwaysInline || DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if ((SizeVal & 7) == 0 && Subtarget.is64Bit())
+      Alignment = Align(8);
+    else if ((SizeVal & 3) == 0)
+      Alignment = Align(4);
+    else if ((SizeVal & 1) == 0)
+      Alignment = Align(2);
+    else
+      Alignment = Align(1);
+  } else if (Alignment < Align(4) ||
+             SizeVal > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();
+
   SDValue InGlue;
   EVT AVT;
   SDValue Count;
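
The ladder above can be stated standalone; a runnable sketch (alignFromSize is illustrative, not LLVM API): the trailing zero bits of the constant size bound the widest element that tiles the range with no remainder.

#include <cstdint>
#include <cstdio>

// Widest power-of-two alignment the low bits of a constant size allow,
// mirroring the new ladder above (illustrative, not LLVM API).
static unsigned alignFromSize(uint64_t SizeVal, bool Is64Bit) {
  if ((SizeVal & 7) == 0 && Is64Bit)
    return 8; // divisible by 8: qword ops cover the range with no tail
  if ((SizeVal & 3) == 0)
    return 4;
  if ((SizeVal & 1) == 0)
    return 2;
  return 1;
}

int main() {
  printf("%u %u %u\n", alignFromSize(1024, true), // 8
         alignFromSize(6, true),                  // 2
         alignFromSize(7, true));                 // 1
  return 0;
}

Note the parenthesization: (SizeVal & 7) == 0, since == binds tighter than & in C++ and SizeVal & 7 == 0 would always evaluate to zero.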
@@ -142,7 +169,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
         DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                     DAG.getConstant(Offset, dl, AddrVT)),
         Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
-        isVolatile, AlwaysInline,
+        isVolatile, /* AlwaysInline */ true,
         /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
 }
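
This hunk covers the leftover tail of a memset that was mostly expanded with rep stos: the recursive getMemset for the remaining bytes is now unconditionally inlined rather than inheriting a flag that could let it bail out to libc. A worked example with assumed values (the 1030-byte size and i64 element type are illustrative; only BytesLeft, Offset, Val, and Alignment appear in the hunk):

// Assumed for illustration: a 1030-byte memset expanded with i64 elements.
uint64_t SizeVal   = 1030;
uint64_t UBytes    = 8;                   // bytes per i64 store
uint64_t Count     = SizeVal / UBytes;    // 128 iterations of rep stosq
uint64_t BytesLeft = SizeVal % UBytes;    // 6 trailing bytes
uint64_t Offset    = SizeVal - BytesLeft; // 1024, where the tail starts
// The tail memset for those 6 bytes now passes /* AlwaysInline */ true,
// so it can never fall back to a memset libcall.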

@@ -208,19 +235,43 @@ static SDValue emitConstantSizeRepmov(
     Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
 
-  /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
-  /// efficient.
-  if (!AlwaysInline && Size > Subtarget.getMaxInlineSizeThreshold())
-    return SDValue();
+  // If we MUST inline, then don't care about the given alignment: on x86
+  // the CPU doesn't care either, as long as the count is a multiple of a
+  // type's width.
+  //
+  // Unaligned accesses are not slower than reading the same value byte by
+  // byte; they are slower than aligned accesses, but we don't have that
+  // option here. The real slowdown comes from a value straddling cache
+  // lines, which forces the CPU to read two cache lines per access
+  // (likewise for writes), but that's unavoidable: we have to access the
+  // range one way or another, so we pay the penalty either way.
+
+  // So instead we derive the alignment from what the constant size
+  // guarantees.
+
+  /// In case we optimize for size, we use repmovsb even if it's less
+  /// efficient so we can save the loads/stores of the leftover.
 
   /// If we have enhanced repmovs we use it.
   if (Subtarget.hasERMSB())
     return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
 
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
-  /// We assume runtime memcpy will do a better job for unaligned copies when
-  /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+
+  if (AlwaysInline || DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if ((Size & 7) == 0 && Subtarget.is64Bit())
+      Alignment = Align(8);
+    else if ((Size & 3) == 0)
+      Alignment = Align(4);
+    else if ((Size & 1) == 0)
+      Alignment = Align(2);
+    else
+      Alignment = Align(1);
+
+    /// We assume runtime memcpy will do a better job for unaligned copies
+    /// when ERMS is not present.
+  } else if (Alignment < Align(4) ||
+             Size > Subtarget.getMaxInlineSizeThreshold())
     return SDValue();
 
   const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
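
The chosen alignment then feeds getOptimalRepmovsType, which picks the element width for the rep movs expansion. A minimal sketch of that relationship (illustrative only; repmovsFor is a hypothetical stand-in, the real function returns an MVT):

#include <cstdio>

// Illustrative mapping from the chosen alignment to the widest rep movs
// variant it permits (hypothetical helper, not the LLVM function).
static const char *repmovsFor(unsigned Alignment, bool Is64Bit) {
  switch (Alignment) {
  case 1:  return "rep movsb"; // one byte per iteration
  case 2:  return "rep movsw";
  case 4:  return "rep movsd";
  default: return Is64Bit ? "rep movsq" : "rep movsd";
  }
}

int main() {
  printf("%s\n", repmovsFor(8, true)); // rep movsq
  printf("%s\n", repmovsFor(1, true)); // rep movsb
  return 0;
}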

llvm/test/CodeGen/X86/memset-vs-memset-inline.ll (+4 −131)
@@ -28,137 +28,10 @@ define void @regular_memset_calls_external_function(ptr %a, i8 %value) nounwind
 define void @inlined_set_doesnt_call_external_function(ptr %a, i8 %value) nounwind {
 ; CHECK-LABEL: inlined_set_doesnt_call_external_function:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movzbl %sil, %ecx
-; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101
-; CHECK-NEXT: imulq %rcx, %rax
-; CHECK-NEXT: movq %rax, 1016(%rdi)
-; CHECK-NEXT: movq %rax, 1008(%rdi)
-; CHECK-NEXT: movq %rax, 1000(%rdi)
-; CHECK-NEXT: movq %rax, 992(%rdi)
-; CHECK-NEXT: movq %rax, 984(%rdi)
-; CHECK-NEXT: movq %rax, 976(%rdi)
-; CHECK-NEXT: movq %rax, 968(%rdi)
-; CHECK-NEXT: movq %rax, 960(%rdi)
-; CHECK-NEXT: movq %rax, 952(%rdi)
-; CHECK-NEXT: movq %rax, 944(%rdi)
-; CHECK-NEXT: movq %rax, 936(%rdi)
-; CHECK-NEXT: movq %rax, 928(%rdi)
-; CHECK-NEXT: movq %rax, 920(%rdi)
-; CHECK-NEXT: movq %rax, 912(%rdi)
-; CHECK-NEXT: movq %rax, 904(%rdi)
-; CHECK-NEXT: movq %rax, 896(%rdi)
-; CHECK-NEXT: movq %rax, 888(%rdi)
-; CHECK-NEXT: movq %rax, 880(%rdi)
-; CHECK-NEXT: movq %rax, 872(%rdi)
-; CHECK-NEXT: movq %rax, 864(%rdi)
-; CHECK-NEXT: movq %rax, 856(%rdi)
-; CHECK-NEXT: movq %rax, 848(%rdi)
-; CHECK-NEXT: movq %rax, 840(%rdi)
-; CHECK-NEXT: movq %rax, 832(%rdi)
-; CHECK-NEXT: movq %rax, 824(%rdi)
-; CHECK-NEXT: movq %rax, 816(%rdi)
-; CHECK-NEXT: movq %rax, 808(%rdi)
-; CHECK-NEXT: movq %rax, 800(%rdi)
-; CHECK-NEXT: movq %rax, 792(%rdi)
-; CHECK-NEXT: movq %rax, 784(%rdi)
-; CHECK-NEXT: movq %rax, 776(%rdi)
-; CHECK-NEXT: movq %rax, 768(%rdi)
-; CHECK-NEXT: movq %rax, 760(%rdi)
-; CHECK-NEXT: movq %rax, 752(%rdi)
-; CHECK-NEXT: movq %rax, 744(%rdi)
-; CHECK-NEXT: movq %rax, 736(%rdi)
-; CHECK-NEXT: movq %rax, 728(%rdi)
-; CHECK-NEXT: movq %rax, 720(%rdi)
-; CHECK-NEXT: movq %rax, 712(%rdi)
-; CHECK-NEXT: movq %rax, 704(%rdi)
-; CHECK-NEXT: movq %rax, 696(%rdi)
-; CHECK-NEXT: movq %rax, 688(%rdi)
-; CHECK-NEXT: movq %rax, 680(%rdi)
-; CHECK-NEXT: movq %rax, 672(%rdi)
-; CHECK-NEXT: movq %rax, 664(%rdi)
-; CHECK-NEXT: movq %rax, 656(%rdi)
-; CHECK-NEXT: movq %rax, 648(%rdi)
-; CHECK-NEXT: movq %rax, 640(%rdi)
-; CHECK-NEXT: movq %rax, 632(%rdi)
-; CHECK-NEXT: movq %rax, 624(%rdi)
-; CHECK-NEXT: movq %rax, 616(%rdi)
-; CHECK-NEXT: movq %rax, 608(%rdi)
-; CHECK-NEXT: movq %rax, 600(%rdi)
-; CHECK-NEXT: movq %rax, 592(%rdi)
-; CHECK-NEXT: movq %rax, 584(%rdi)
-; CHECK-NEXT: movq %rax, 576(%rdi)
-; CHECK-NEXT: movq %rax, 568(%rdi)
-; CHECK-NEXT: movq %rax, 560(%rdi)
-; CHECK-NEXT: movq %rax, 552(%rdi)
-; CHECK-NEXT: movq %rax, 544(%rdi)
-; CHECK-NEXT: movq %rax, 536(%rdi)
-; CHECK-NEXT: movq %rax, 528(%rdi)
-; CHECK-NEXT: movq %rax, 520(%rdi)
-; CHECK-NEXT: movq %rax, 512(%rdi)
-; CHECK-NEXT: movq %rax, 504(%rdi)
-; CHECK-NEXT: movq %rax, 496(%rdi)
-; CHECK-NEXT: movq %rax, 488(%rdi)
-; CHECK-NEXT: movq %rax, 480(%rdi)
-; CHECK-NEXT: movq %rax, 472(%rdi)
-; CHECK-NEXT: movq %rax, 464(%rdi)
-; CHECK-NEXT: movq %rax, 456(%rdi)
-; CHECK-NEXT: movq %rax, 448(%rdi)
-; CHECK-NEXT: movq %rax, 440(%rdi)
-; CHECK-NEXT: movq %rax, 432(%rdi)
-; CHECK-NEXT: movq %rax, 424(%rdi)
-; CHECK-NEXT: movq %rax, 416(%rdi)
-; CHECK-NEXT: movq %rax, 408(%rdi)
-; CHECK-NEXT: movq %rax, 400(%rdi)
-; CHECK-NEXT: movq %rax, 392(%rdi)
-; CHECK-NEXT: movq %rax, 384(%rdi)
-; CHECK-NEXT: movq %rax, 376(%rdi)
-; CHECK-NEXT: movq %rax, 368(%rdi)
-; CHECK-NEXT: movq %rax, 360(%rdi)
-; CHECK-NEXT: movq %rax, 352(%rdi)
-; CHECK-NEXT: movq %rax, 344(%rdi)
-; CHECK-NEXT: movq %rax, 336(%rdi)
-; CHECK-NEXT: movq %rax, 328(%rdi)
-; CHECK-NEXT: movq %rax, 320(%rdi)
-; CHECK-NEXT: movq %rax, 312(%rdi)
-; CHECK-NEXT: movq %rax, 304(%rdi)
-; CHECK-NEXT: movq %rax, 296(%rdi)
-; CHECK-NEXT: movq %rax, 288(%rdi)
-; CHECK-NEXT: movq %rax, 280(%rdi)
-; CHECK-NEXT: movq %rax, 272(%rdi)
-; CHECK-NEXT: movq %rax, 264(%rdi)
-; CHECK-NEXT: movq %rax, 256(%rdi)
-; CHECK-NEXT: movq %rax, 248(%rdi)
-; CHECK-NEXT: movq %rax, 240(%rdi)
-; CHECK-NEXT: movq %rax, 232(%rdi)
-; CHECK-NEXT: movq %rax, 224(%rdi)
-; CHECK-NEXT: movq %rax, 216(%rdi)
-; CHECK-NEXT: movq %rax, 208(%rdi)
-; CHECK-NEXT: movq %rax, 200(%rdi)
-; CHECK-NEXT: movq %rax, 192(%rdi)
-; CHECK-NEXT: movq %rax, 184(%rdi)
-; CHECK-NEXT: movq %rax, 176(%rdi)
-; CHECK-NEXT: movq %rax, 168(%rdi)
-; CHECK-NEXT: movq %rax, 160(%rdi)
-; CHECK-NEXT: movq %rax, 152(%rdi)
-; CHECK-NEXT: movq %rax, 144(%rdi)
-; CHECK-NEXT: movq %rax, 136(%rdi)
-; CHECK-NEXT: movq %rax, 128(%rdi)
-; CHECK-NEXT: movq %rax, 120(%rdi)
-; CHECK-NEXT: movq %rax, 112(%rdi)
-; CHECK-NEXT: movq %rax, 104(%rdi)
-; CHECK-NEXT: movq %rax, 96(%rdi)
-; CHECK-NEXT: movq %rax, 88(%rdi)
-; CHECK-NEXT: movq %rax, 80(%rdi)
-; CHECK-NEXT: movq %rax, 72(%rdi)
-; CHECK-NEXT: movq %rax, 64(%rdi)
-; CHECK-NEXT: movq %rax, 56(%rdi)
-; CHECK-NEXT: movq %rax, 48(%rdi)
-; CHECK-NEXT: movq %rax, 40(%rdi)
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: movq %rax, 24(%rdi)
-; CHECK-NEXT: movq %rax, 16(%rdi)
-; CHECK-NEXT: movq %rax, 8(%rdi)
-; CHECK-NEXT: movq %rax, (%rdi)
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movl $1024, %ecx # imm = 0x400
+; CHECK-NEXT: # kill: def $al killed $al killed $eax
+; CHECK-NEXT: rep;stosb %al, %es:(%rdi)
 ; CHECK-NEXT: retq
   tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 1024, i1 0)
   ret void
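
For context, the IR under test is what IRBuilder's CreateMemSetInline produces: an @llvm.memset.inline intrinsic, which the backend must expand inline, i.e. the AlwaysInline=true path exercised above. A minimal sketch, assuming a recent LLVM with opaque pointers (buildInlineMemset and the function name are illustrative):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Builds a function like the test above: an unconditional 1024-byte memset
// via @llvm.memset.inline, which can never become a libcall.
void buildInlineMemset(Module &M) {
  LLVMContext &Ctx = M.getContext();
  auto *FnTy = FunctionType::get(
      Type::getVoidTy(Ctx), {PointerType::get(Ctx, 0), Type::getInt8Ty(Ctx)},
      /*isVarArg=*/false);
  Function *F =
      Function::Create(FnTy, Function::ExternalLinkage, "inlined_set", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  B.CreateMemSetInline(F->getArg(0), /*DstAlign=*/MaybeAlign(1),
                       F->getArg(1), /*Size=*/B.getInt64(1024),
                       /*IsVolatile=*/false);
  B.CreateRetVoid();
}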
