Skip to content

Commit 640c7ca

Browse files
committed
[X86] For minsize, use size for alignment, rather than actual alignment
If we have minsize, then we don't care about the alignment. On x86, the CPU doesn't care and neither should you. As long as the count is aligned, we can use fewer instructions.
1 parent e4cf55a commit 640c7ca

File tree

3 files changed

+78
-300
lines changed

3 files changed

+78
-300
lines changed

llvm/lib/Target/X86/X86SelectionDAGInfo.cpp

+70-39
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,23 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
6666
// If not DWORD aligned or size is more than the threshold, call the library.
6767
// The libc version is likely to be faster for these cases. It can use the
6868
// address value and run time information about the CPU.
69-
if (Alignment < Align(4) || !ConstantSize ||
70-
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
69+
if (!ConstantSize)
7170
return SDValue();
7271

7372
uint64_t SizeVal = ConstantSize->getZExtValue();
73+
if (!AlwaysInline &&
74+
(Alignment < Align(4) || SizeVal > Subtarget.getMaxInlineSizeThreshold()))
75+
return SDValue();
76+
77+
// If we have minsize, then we don't care about the alignment.
78+
// On x86, the CPU doesn't care and neither should you.
79+
// As long as the count is aligned, we can use the minimum number of
80+
// instructions without always having to resort to stosb.
81+
//
82+
// Because this is a feature specific to x86, we must handle it here.
83+
if (DAG.getMachineFunction().getFunction().hasMinSize())
84+
Alignment = commonAlignment(Align(Subtarget.is64Bit() ? 8 : 4), SizeVal);
85+
7486
SDValue InGlue;
7587
EVT AVT;
7688
SDValue Count;
@@ -80,13 +92,13 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
8092
uint64_t Val = ValC->getZExtValue() & 255;
8193

8294
// If the value is a constant, then we can potentially use larger sets.
83-
if (Alignment > Align(2)) {
95+
if (Alignment >= Align(4)) {
8496
// DWORD aligned
8597
AVT = MVT::i32;
8698
ValReg = X86::EAX;
8799
Val = (Val << 8) | Val;
88100
Val = (Val << 16) | Val;
89-
if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
101+
if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
90102
AVT = MVT::i64;
91103
ValReg = X86::RAX;
92104
Val = (Val << 32) | Val;
@@ -103,12 +115,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
103115
Count = DAG.getIntPtrConstant(SizeVal, dl);
104116
}
105117

106-
if (AVT.bitsGT(MVT::i8)) {
107-
unsigned UBytes = AVT.getSizeInBits() / 8;
108-
Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
109-
BytesLeft = SizeVal % UBytes;
110-
}
111-
118+
const uint64_t BlockBytes = AVT.getSizeInBits() / 8;
119+
const uint64_t BlockCount = SizeVal / BlockBytes;
120+
Count = DAG.getIntPtrConstant(BlockCount, dl);
121+
BytesLeft = SizeVal % BlockBytes;
112122
Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
113123
InGlue);
114124
InGlue = Chain.getValue(1);
@@ -120,34 +130,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
120130
}
121131

122132
bool Use64BitRegs = Subtarget.isTarget64BitLP64();
123-
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
124-
Count, InGlue);
133+
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
134+
InGlue);
125135
InGlue = Chain.getValue(1);
126-
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
127-
Dst, InGlue);
136+
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
137+
InGlue);
128138
InGlue = Chain.getValue(1);
129139

130140
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
131-
SDValue Ops[] = { Chain, DAG.getValueType(AVT), InGlue };
132-
Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
133-
134-
if (BytesLeft) {
135-
// Handle the last 1 - 7 bytes.
136-
unsigned Offset = SizeVal - BytesLeft;
137-
EVT AddrVT = Dst.getValueType();
138-
EVT SizeVT = Size.getValueType();
139-
140-
Chain =
141-
DAG.getMemset(Chain, dl,
142-
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
143-
DAG.getConstant(Offset, dl, AddrVT)),
144-
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
145-
isVolatile, AlwaysInline,
146-
/* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
147-
}
141+
SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
142+
SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
143+
144+
/// RepStos can process the whole length.
145+
//
146+
// Because we changed the alignment earlier in the function to work on size
147+
// when we have the minsize attribute, this is guaranteed to be 0 when we get
148+
// here.
149+
if (BytesLeft == 0)
150+
return RepStos;
148151

149-
// TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
150-
return Chain;
152+
// Handle the last 1 - 7 bytes.
153+
SmallVector<SDValue, 4> Results;
154+
Results.push_back(RepStos);
155+
unsigned Offset = SizeVal - BytesLeft;
156+
EVT AddrVT = Dst.getValueType();
157+
EVT SizeVT = Size.getValueType();
158+
159+
Results.push_back(
160+
DAG.getMemset(Chain, dl,
161+
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
162+
DAG.getConstant(Offset, dl, AddrVT)),
163+
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
164+
isVolatile, /* AlwaysInline */ true,
165+
/* isTailCall */ false, DstPtrInfo.getWithOffset(Offset)));
166+
167+
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
151168
}
152169

153170
/// Emit a single REP MOVS{B,W,D,Q} instruction.
@@ -220,13 +237,32 @@ static SDValue emitConstantSizeRepmov(
220237
assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
221238
/// We assume runtime memcpy will do a better job for unaligned copies when
222239
/// ERMS is not present.
223-
if (!AlwaysInline && (Alignment.value() & 3) != 0)
240+
if (!AlwaysInline && (Alignment < Align(4)))
224241
return SDValue();
225242

243+
// If we have minsize, then we don't care about the alignment.
244+
// On x86, the CPU doesn't care and neither should you.
245+
// As long as the count is aligned, we can use the minimum number of
246+
// instructions without always having to resort to movsb.
247+
//
248+
// Because this is a feature specific to x86, we must handle it here.
249+
250+
if (DAG.getMachineFunction().getFunction().hasMinSize())
251+
Alignment = commonAlignment(Align(Subtarget.is64Bit() ? 8 : 4), Size);
252+
226253
const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
227254
const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
228255
const uint64_t BlockCount = Size / BlockBytes;
229256
const uint64_t BytesLeft = Size % BlockBytes;
257+
258+
if (DAG.getMachineFunction().getFunction().hasMinSize()) {
259+
// Use the one instruction determined. Because we changed the alignment
260+
// earlier in the function to work on size when we have the minsize
261+
// attribute, it is guaranteed to process the entire length.
262+
return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
263+
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
264+
}
265+
230266
SDValue RepMovs =
231267
emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
232268
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
@@ -237,11 +273,6 @@ static SDValue emitConstantSizeRepmov(
237273

238274
assert(BytesLeft && "We have leftover at this point");
239275

240-
/// In case we optimize for size we use repmovsb even if it's less efficient
241-
/// so we can save the loads/stores of the leftover.
242-
if (DAG.getMachineFunction().getFunction().hasMinSize())
243-
return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
244-
245276
// Handle the last 1 - 7 bytes.
246277
SmallVector<SDValue, 4> Results;
247278
Results.push_back(RepMovs);

llvm/test/CodeGen/X86/memset-minsize.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@ entry:
1414
define void @small_memset_to_rep_stos(ptr %ptr) minsize nounwind {
1515
; CHECK-LABEL: small_memset_to_rep_stos:
1616
; CHECK: # %bb.0: # %entry
17-
; CHECK-NEXT: pushq $32
17+
; CHECK-NEXT: pushq $16
1818
; CHECK-NEXT: popq %rcx
1919
; CHECK-NEXT: xorl %eax, %eax
20-
; CHECK-NEXT: rep;stosl %eax, %es:(%rdi)
20+
; CHECK-NEXT: rep;stosq %rax, %es:(%rdi)
2121
; CHECK-NEXT: retq
2222
entry:
2323
call void @llvm.memset.p0.i32(ptr align 4 %ptr, i8 0, i32 128, i1 false)

0 commit comments

Comments
 (0)