Skip to content

Commit a265b4d

Browse files
committed
[X86] For minsize, use size for alignment, rather than actual alignment
If we have minsize, then we don't care about the alignment. On x86, the CPU doesn't care and neither should you. As long as the count is aligned, we can use fewer instructions.
1 parent 1f38ccf commit a265b4d

File tree

1 file changed

+83
-37
lines changed

1 file changed

+83
-37
lines changed

llvm/lib/Target/X86/X86SelectionDAGInfo.cpp

Lines changed: 83 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,27 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
6767
// The libc version is likely to be faster for these cases. It can use the
6868
// address value and run time information about the CPU.
6969
if (Alignment < Align(4) || !ConstantSize ||
70-
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
70+
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
7171
return SDValue();
7272

73+
// If we have minsize, then don't care about the alignment.
74+
// On x86, the CPU doesn't care and neither should you.
75+
// As long as the count is aligned, we can use the minimum number of
76+
// instructions without always having to resort to stosb.
77+
//
78+
// Because this is a feature specific to x86, we must handle it here.
7379
uint64_t SizeVal = ConstantSize->getZExtValue();
80+
if (DAG.getMachineFunction().getFunction().hasMinSize()) {
81+
if ((SizeVal & 7) == 0 && Subtarget.is64Bit())
82+
Alignment = Align(8);
83+
else if ((SizeVal & 3) == 0)
84+
Alignment = Align(4);
85+
else if ((SizeVal & 1) == 0)
86+
Alignment = Align(2);
87+
else
88+
Alignment = Align(1);
89+
}
90+
7491
SDValue InGlue;
7592
EVT AVT;
7693
SDValue Count;
@@ -86,7 +103,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
86103
ValReg = X86::EAX;
87104
Val = (Val << 8) | Val;
88105
Val = (Val << 16) | Val;
89-
if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
106+
if (Subtarget.is64Bit() && Alignment > Align(4)) { // QWORD aligned
90107
AVT = MVT::i64;
91108
ValReg = X86::RAX;
92109
Val = (Val << 32) | Val;
@@ -103,12 +120,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
103120
Count = DAG.getIntPtrConstant(SizeVal, dl);
104121
}
105122

106-
if (AVT.bitsGT(MVT::i8)) {
107-
unsigned UBytes = AVT.getSizeInBits() / 8;
108-
Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
109-
BytesLeft = SizeVal % UBytes;
110-
}
111-
123+
const uint64_t BlockBytes = AVT.getSizeInBits() / 8;
124+
const uint64_t BlockCount = SizeVal / BlockBytes;
125+
Count = DAG.getIntPtrConstant(BlockCount, dl);
126+
BytesLeft = SizeVal % BlockBytes;
112127
Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
113128
InGlue);
114129
InGlue = Chain.getValue(1);
@@ -120,34 +135,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
120135
}
121136

122137
bool Use64BitRegs = Subtarget.isTarget64BitLP64();
123-
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
124-
Count, InGlue);
138+
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
139+
InGlue);
125140
InGlue = Chain.getValue(1);
126-
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
127-
Dst, InGlue);
141+
Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
142+
InGlue);
128143
InGlue = Chain.getValue(1);
129144

130145
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
131-
SDValue Ops[] = { Chain, DAG.getValueType(AVT), InGlue };
132-
Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
133-
134-
if (BytesLeft) {
135-
// Handle the last 1 - 7 bytes.
136-
unsigned Offset = SizeVal - BytesLeft;
137-
EVT AddrVT = Dst.getValueType();
138-
EVT SizeVT = Size.getValueType();
139-
140-
Chain =
141-
DAG.getMemset(Chain, dl,
142-
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
143-
DAG.getConstant(Offset, dl, AddrVT)),
144-
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
145-
isVolatile, AlwaysInline,
146-
/* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
147-
}
146+
SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
147+
SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
148+
149+
/// RepStos can process the whole length.
150+
//
151+
// Because we changed the alignment earlier in the function to work on size
152+
// when we have the minsize attribute, this is guaranteed to be 0 when we get
153+
// here.
154+
if (BytesLeft == 0)
155+
return RepStos;
148156

149-
// TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
150-
return Chain;
157+
// Handle the last 1 - 7 bytes.
158+
SmallVector<SDValue, 4> Results;
159+
Results.push_back(RepStos);
160+
unsigned Offset = SizeVal - BytesLeft;
161+
EVT AddrVT = Dst.getValueType();
162+
EVT SizeVT = Size.getValueType();
163+
164+
Results.push_back(
165+
DAG.getMemset(Chain, dl,
166+
DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
167+
DAG.getConstant(Offset, dl, AddrVT)),
168+
Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
169+
isVolatile, /* isAlwaysInline */ true,
170+
/* isTailCall */ false, DstPtrInfo.getWithOffset(Offset)));
171+
172+
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
151173
}
152174

153175
/// Emit a single REP MOVS{B,W,D,Q} instruction.
@@ -220,13 +242,42 @@ static SDValue emitConstantSizeRepmov(
220242
assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
221243
/// We assume runtime memcpy will do a better job for unaligned copies when
222244
/// ERMS is not present.
223-
if (!AlwaysInline && (Alignment.value() & 3) != 0)
245+
if (!AlwaysInline && (Alignment < Align(4)))
224246
return SDValue();
225247

248+
// If we have minsize, then don't care about the alignment.
249+
// On x86, the CPU doesn't care and neither should you.
250+
// As long as the count is aligned, we can use the minimum number of
251+
// instructions without always having to resort to movsb
252+
//
253+
// Because this is a feature specific to x86, we must handle it here.
254+
255+
if (DAG.getMachineFunction().getFunction().hasMinSize()) {
256+
if ((Size & 15) == 0 && Subtarget.is64Bit())
257+
Alignment = Align(16);
258+
else if ((Size & 7) == 0)
259+
Alignment = Align(8);
260+
else if ((Size & 3) == 0)
261+
Alignment = Align(4);
262+
else if ((Size & 1) == 0)
263+
Alignment = Align(2);
264+
else
265+
Alignment = Align(1);
266+
}
267+
226268
const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
227269
const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
228270
const uint64_t BlockCount = Size / BlockBytes;
229271
const uint64_t BytesLeft = Size % BlockBytes;
272+
273+
if (DAG.getMachineFunction().getFunction().hasMinSize()) {
274+
// Use the one instruction determined. Because we changed the alignment
275+
// earlier in the function to work on size when we have the minsize
276+
// attribute, it is guaranteed to process the entire length.
277+
return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
278+
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
279+
}
280+
230281
SDValue RepMovs =
231282
emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
232283
DAG.getIntPtrConstant(BlockCount, dl), BlockType);
@@ -237,11 +288,6 @@ static SDValue emitConstantSizeRepmov(
237288

238289
assert(BytesLeft && "We have leftover at this point");
239290

240-
/// In case we optimize for size we use repmovsb even if it's less efficient
241-
/// so we can save the loads/stores of the leftover.
242-
if (DAG.getMachineFunction().getFunction().hasMinSize())
243-
return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
244-
245291
// Handle the last 1 - 7 bytes.
246292
SmallVector<SDValue, 4> Results;
247293
Results.push_back(RepMovs);

0 commit comments

Comments
 (0)