@@ -67,10 +67,27 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
   if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
+      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
     return SDValue();
 
+  // If we have minsize, then we don't care about the alignment.
+  // On x86, the CPU doesn't care and neither should you.
+  // As long as the count is aligned, we can use the minimum number of
+  // instructions without always having to resort to stosb.
+  //
+  // Because this is a feature specific to x86, we must handle it here.
   uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if ((SizeVal & 7) == 0 && Subtarget.is64Bit())
+      Alignment = Align(8);
+    else if ((SizeVal & 3) == 0)
+      Alignment = Align(4);
+    else if ((SizeVal & 1) == 0)
+      Alignment = Align(2);
+    else
+      Alignment = Align(1);
+  }
+
   SDValue InGlue;
   EVT AVT;
   SDValue Count;
@@ -86,7 +103,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
       ValReg = X86::EAX;
       Val = (Val << 8) | Val;
       Val = (Val << 16) | Val;
-      if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
+      if (Subtarget.is64Bit() && Alignment > Align(4)) { // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
@@ -103,12 +120,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
       Count = DAG.getIntPtrConstant(SizeVal, dl);
     }
 
-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
-    }
-
+    const uint64_t BlockBytes = AVT.getSizeInBits() / 8;
+    const uint64_t BlockCount = SizeVal / BlockBytes;
+    Count = DAG.getIntPtrConstant(BlockCount, dl);
+    BytesLeft = SizeVal % BlockBytes;
     Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
                              InGlue);
     InGlue = Chain.getValue(1);
@@ -120,34 +135,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   }
 
   bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
-                           Count, InGlue);
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
+                           InGlue);
   InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
-                           Dst, InGlue);
+  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
+                           InGlue);
   InGlue = Chain.getValue(1);
 
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InGlue };
-  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
-
-  if (BytesLeft) {
-    // Handle the last 1 - 7 bytes.
-    unsigned Offset = SizeVal - BytesLeft;
-    EVT AddrVT = Dst.getValueType();
-    EVT SizeVT = Size.getValueType();
-
-    Chain =
-        DAG.getMemset(Chain, dl,
-                      DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
-                                  DAG.getConstant(Offset, dl, AddrVT)),
-                      Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
-                      isVolatile, AlwaysInline,
-                      /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
-  }
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+
+  /// RepStos can process the whole length.
+  //
+  // Because we changed the alignment earlier in the function to work on size
+  // when we have the minsize attribute, this is guaranteed to be 0 when we get
+  // here.
+  if (BytesLeft == 0)
+    return RepStos;
 
-  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
-  return Chain;
+  // Handle the last 1 - 7 bytes.
+  SmallVector<SDValue, 4> Results;
+  Results.push_back(RepStos);
+  unsigned Offset = SizeVal - BytesLeft;
+  EVT AddrVT = Dst.getValueType();
+  EVT SizeVT = Size.getValueType();
+
+  Results.push_back(
+      DAG.getMemset(Chain, dl,
+                    DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+                                DAG.getConstant(Offset, dl, AddrVT)),
+                    Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
+                    isVolatile, /* isAlwaysInline */ true,
+                    /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset)));
+
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }
 
 /// Emit a single REP MOVS{B,W,D,Q} instruction.
@@ -220,13 +242,42 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+  if (!AlwaysInline && (Alignment < Align(4)))
     return SDValue();
 
+  // If we have minsize, then we don't care about the alignment.
+  // On x86, the CPU doesn't care and neither should you.
+  // As long as the count is aligned, we can use the minimum number of
+  // instructions without always having to resort to movsb.
+  //
+  // Because this is a feature specific to x86, we must handle it here.
+
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if ((Size & 15) == 0 && Subtarget.is64Bit())
+      Alignment = Align(16);
+    else if ((Size & 7) == 0)
+      Alignment = Align(8);
+    else if ((Size & 3) == 0)
+      Alignment = Align(4);
+    else if ((Size & 1) == 0)
+      Alignment = Align(2);
+    else
+      Alignment = Align(1);
+  }
+
   const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
+
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    // Use the single instruction determined above. Because we changed the
+    // alignment earlier in the function to work on size when we have the
+    // minsize attribute, it is guaranteed to process the entire length.
+    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
+                       DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+  }
+
   SDValue RepMovs =
       emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src,
                   DAG.getIntPtrConstant(BlockCount, dl), BlockType);
@@ -237,11 +288,6 @@ static SDValue emitConstantSizeRepmov(
 
   assert(BytesLeft && "We have leftover at this point");
 
-  /// In case we optimize for size we use repmovsb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
-
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepMovs);
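For readers who want to check the minsize arithmetic outside of SelectionDAG, here is a small self-contained C++ sketch of the same idea. It is an illustration only, not code from the patch: RepStosPlan, planRepStos, and the sample sizes are made up for this note. It reproduces the size-driven element-width choice from EmitTargetCodeForMemset and shows why the leftover byte count is zero once the width is derived from the size itself.

// Standalone sketch, not part of the patch: the struct, function, and sample
// sizes below are hypothetical; only the arithmetic mirrors the diff.
#include <cstdint>
#include <cstdio>

struct RepStosPlan {
  uint64_t ElementBytes; // 8 -> REP STOSQ, 4 -> STOSD, 2 -> STOSW, 1 -> STOSB
  uint64_t Count;        // block count loaded into RCX/ECX
  uint64_t TailBytes;    // leftover bytes after the REP STOS
};

// Mirror of the minsize path: pick the element width from the *size* rather
// than the pointer alignment, then split the size into blocks and a tail.
static RepStosPlan planRepStos(uint64_t SizeVal, bool Is64Bit) {
  uint64_t ElementBytes;
  if ((SizeVal & 7) == 0 && Is64Bit)
    ElementBytes = 8;
  else if ((SizeVal & 3) == 0)
    ElementBytes = 4;
  else if ((SizeVal & 1) == 0)
    ElementBytes = 2;
  else
    ElementBytes = 1;
  // Because ElementBytes divides SizeVal by construction, TailBytes is always
  // 0 here -- the "guaranteed to be 0 when we get here" claim in the patch.
  return {ElementBytes, SizeVal / ElementBytes, SizeVal % ElementBytes};
}

int main() {
  const uint64_t Sizes[] = {24, 30, 31, 100};
  for (uint64_t Size : Sizes) {
    RepStosPlan P = planRepStos(Size, /*Is64Bit=*/true);
    std::printf("size=%llu -> element=%llu count=%llu tail=%llu\n",
                (unsigned long long)Size, (unsigned long long)P.ElementBytes,
                (unsigned long long)P.Count, (unsigned long long)P.TailBytes);
  }
  return 0;
}

The repmov path in the second half of the diff applies the same idea, with an extra 16-byte case before the alignment is handed to getOptimalRepmovsType.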