Skip to content

Commit 9bf30fe

Browse files
committed
[X86] For minsize, use size for alignment, rather than actual alignment
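If the function has the minsize attribute, we don't need to care about the pointer alignment. On x86, the CPU tolerates unaligned accesses, so neither should we insist on alignment. As long as the byte count is a multiple of the store/copy element size, we can use fewer instructions.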
1 parent 206a3ea commit 9bf30fe

File tree

1 file changed

+86
-38
lines changed

1 file changed

+86
-38
lines changed

llvm/lib/Target/X86/X86SelectionDAGInfo.cpp

Lines changed: 86 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,30 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
6666
// If the size is not a constant, or (unless inlining is forced) the
// destination is not DWORD aligned or the size is more than the threshold,
// call the library.
6767
// The libc version is likely to be faster for these cases. It can use the
6868
// address value and run time information about the CPU.
69-
if (Alignment < Align(4) || !ConstantSize ||
70-
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
69+
if (!ConstantSize ||
70+
(!AlwaysInline &&
71+
(Alignment < Align(4) ||
72+
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())))
7173
return SDValue();
7274

75+
// If we have minsize, then don't care about the alignment.
76+
// On x86, the CPU doesn't care and neither should you.
77+
// As long as the count is aligned, we can use the minimum number of
78+
// instructions without always having to resort to stosb.
79+
//
80+
// Because this is a feature specific to x86, we must handle it here.
7381
uint64_t SizeVal = ConstantSize->getZExtValue();
82+
if (DAG.getMachineFunction().getFunction().hasMinSize()) {
83+
if ((SizeVal & 7) == 0 && Subtarget.is64Bit())
84+
Alignment = Align(8);
85+
else if ((SizeVal & 3) == 0)
86+
Alignment = Align(4);
87+
else if ((SizeVal & 1) == 0)
88+
Alignment = Align(2);
89+
else
90+
Alignment = Align(1);
91+
}
92+
7493
SDValue InGlue;
7594
EVT AVT;
7695
SDValue Count;
@@ -86,7 +105,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
86105
ValReg = X86::EAX;
87106
Val = (Val << 8) | Val;
88107
Val = (Val << 16) | Val;
89-
if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
108+
if (Subtarget.is64Bit() && Alignment > Align(4)) { // QWORD aligned
90109
AVT = MVT::i64;
91110
ValReg = X86::RAX;
92111
Val = (Val << 32) | Val;
@@ -103,12 +122,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
103122
Count = DAG.getIntPtrConstant(SizeVal, dl);
104123
}
105124

106-
if (AVT.bitsGT(MVT::i8)) {
107-
unsigned UBytes = AVT.getSizeInBits() / 8;
108-
Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
109-
BytesLeft = SizeVal % UBytes;
110-
}
111-
125+
const uint64_t BlockBytes = AVT.getSizeInBits() / 8;
126+
const uint64_t BlockCount = SizeVal / BlockBytes;
127+
Count = DAG.getIntPtrConstant(BlockCount, dl);
128+
BytesLeft = SizeVal % BlockBytes;
112129
Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
113130
InGlue);
114131
InGlue = Chain.getValue(1);
@@ -120,34 +137,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
120137
}
121138

122139
bool Use64BitRegs = Subtarget.isTarget64BitLP64();
123-
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
124-
Count, InGlue);
140+
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
141+
InGlue);
125142
InGlue = Chain.getValue(1);
126-
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
127-
Dst, InGlue);
143+
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
144+
InGlue);
128145
InGlue = Chain.getValue(1);
129146

130147
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
131-
SDValue Ops[] = { Chain, DAG.getValueType(AVT), InGlue };
132-
Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
133-
134-
if (BytesLeft) {
135-
// Handle the last 1 - 7 bytes.
136-
unsigned Offset = SizeVal - BytesLeft;
137-
EVT AddrVT = Dst.getValueType();
138-
EVT SizeVT = Size.getValueType();
139-
140-
Chain =
141-
DAG.getMemset(Chain, dl,
142-
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
143-
DAG.getConstant(Offset, dl, AddrVT)),
144-
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
145-
isVolatile, AlwaysInline,
146-
/* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
147-
}
148+
SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
149+
SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
150+
151+
// If no bytes are left over, RepStos alone processes the whole length.
152+
//
153+
// Because we changed the alignment earlier in the function to work on size
154+
// when we have the minsize attribute, this is guaranteed to be 0 when we get
155+
// here.
156+
if (BytesLeft == 0)
157+
return RepStos;
148158

149-
// TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
150-
return Chain;
159+
// Handle the last 1 - 7 bytes.
160+
SmallVector<SDValue, 4> Results;
161+
Results.push_back(RepStos);
162+
unsigned Offset = SizeVal - BytesLeft;
163+
EVT AddrVT = Dst.getValueType();
164+
EVT SizeVT = Size.getValueType();
165+
166+
Results.push_back(
167+
DAG.getMemset(Chain, dl,
168+
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
169+
DAG.getConstant(Offset, dl, AddrVT)),
170+
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
171+
isVolatile, /* AlwaysInline */ true,
172+
/* isTailCall */ false, DstPtrInfo.getWithOffset(Offset)));
173+
174+
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
151175
}
152176

153177
/// Emit a single REP MOVS{B,W,D,Q} instruction.
@@ -220,13 +244,42 @@ static SDValue emitConstantSizeRepmov(
220244
assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
221245
/// We assume runtime memcpy will do a better job for unaligned copies when
222246
/// ERMS is not present.
223-
if (!AlwaysInline && (Alignment.value() & 3) != 0)
247+
if (!AlwaysInline && (Alignment < Align(4)))
224248
return SDValue();
225249

250+
// If we have minsize, then don't care about the alignment.
251+
// On x86, the CPU doesn't care and neither should you.
252+
// As long as the count is aligned, we can use the minimum number of
253+
// instructions without always having to resort to movsb.
254+
//
255+
// Because this is a feature specific to x86, we must handle it here.
256+
257+
if (DAG.getMachineFunction().getFunction().hasMinSize()) {
258+
if ((Size & 15) == 0 && Subtarget.is64Bit())
259+
Alignment = Align(16);
260+
else if ((Size & 7) == 0)
261+
Alignment = Align(8);
262+
else if ((Size & 3) == 0)
263+
Alignment = Align(4);
264+
else if ((Size & 1) == 0)
265+
Alignment = Align(2);
266+
else
267+
Alignment = Align(1);
268+
}
269+
226270
const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
227271
const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
228272
const uint64_t BlockCount = Size / BlockBytes;
229273
const uint64_t BytesLeft = Size % BlockBytes;
274+
275+
if (DAG.getMachineFunction().getFunction().hasMinSize()) {
276+
// Use the one instruction determined. Because we changed the alignment
277+
// earlier in the function to work on size when we have the minsize
278+
// attribute, it is guaranteed to process the entire length.
279+
return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
280+
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
281+
}
282+
230283
SDValue RepMovs =
231284
emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
232285
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
@@ -237,11 +290,6 @@ static SDValue emitConstantSizeRepmov(
237290

238291
assert(BytesLeft && "We have leftover at this point");
239292

240-
/// In case we optimize for size we use repmovsb even if it's less efficient
241-
/// so we can save the loads/stores of the leftover.
242-
if (DAG.getMachineFunction().getFunction().hasMinSize())
243-
return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
244-
245293
// Handle the last 1 - 7 bytes.
246294
SmallVector<SDValue, 4> Results;
247295
Results.push_back(RepMovs);

0 commit comments

Comments
 (0)