@@ -66,11 +66,30 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
66
66
// If not DWORD aligned or size is more than the threshold, call the library.
67
67
// The libc version is likely to be faster for these cases. It can use the
68
68
// address value and run time information about the CPU.
69
- if (Alignment < Align (4 ) || !ConstantSize ||
70
- ConstantSize->getZExtValue () > Subtarget.getMaxInlineSizeThreshold ())
69
+ if (!ConstantSize ||
70
+ (!AlwaysInline &&
71
+ (Alignment < Align (4 ) ||
72
+ ConstantSize->getZExtValue () > Subtarget.getMaxInlineSizeThreshold ())))
71
73
return SDValue ();
72
74
75
+ // If we have minsize, then don't care about the alignment.
76
+ // On x86, rep stos works at any address, so destination alignment is moot.
77
+ // As long as the count is aligned, we can use the minimum number of
78
+ // instructions without always having to resort to stosb.
79
+ //
80
+ // Because this is a feature specific to x86, we must handle it here.
73
81
uint64_t SizeVal = ConstantSize->getZExtValue ();
82
+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
83
+ if ((SizeVal & 7 ) == 0 && Subtarget.is64Bit ())
84
+ Alignment = Align (8 );
85
+ else if ((SizeVal & 3 ) == 0 )
86
+ Alignment = Align (4 );
87
+ else if ((SizeVal & 1 ) == 0 )
88
+ Alignment = Align (2 );
89
+ else
90
+ Alignment = Align (1 );
91
+ }
92
+
74
93
SDValue InGlue;
75
94
EVT AVT;
76
95
SDValue Count;
@@ -86,7 +105,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
86
105
ValReg = X86::EAX;
87
106
Val = (Val << 8 ) | Val;
88
107
Val = (Val << 16 ) | Val;
89
- if (Subtarget.is64Bit () && Alignment > Align (8 )) { // QWORD aligned
108
+ if (Subtarget.is64Bit () && Alignment > Align (4 )) { // QWORD aligned
90
109
AVT = MVT::i64;
91
110
ValReg = X86::RAX;
92
111
Val = (Val << 32 ) | Val;
@@ -103,12 +122,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
103
122
Count = DAG.getIntPtrConstant (SizeVal, dl);
104
123
}
105
124
106
- if (AVT.bitsGT (MVT::i8)) {
107
- unsigned UBytes = AVT.getSizeInBits () / 8 ;
108
- Count = DAG.getIntPtrConstant (SizeVal / UBytes, dl);
109
- BytesLeft = SizeVal % UBytes;
110
- }
111
-
125
+ const uint64_t BlockBytes = AVT.getSizeInBits () / 8 ;
126
+ const uint64_t BlockCount = SizeVal / BlockBytes;
127
+ Count = DAG.getIntPtrConstant (BlockCount, dl);
128
+ BytesLeft = SizeVal % BlockBytes;
112
129
Chain = DAG.getCopyToReg (Chain, dl, ValReg, DAG.getConstant (Val, dl, AVT),
113
130
InGlue);
114
131
InGlue = Chain.getValue (1 );
@@ -120,34 +137,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
120
137
}
121
138
122
139
bool Use64BitRegs = Subtarget.isTarget64BitLP64 ();
123
- Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
124
- Count, InGlue);
140
+ Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
141
+ InGlue);
125
142
InGlue = Chain.getValue (1 );
126
- Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
127
- Dst, InGlue);
143
+ Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
144
+ InGlue);
128
145
InGlue = Chain.getValue (1 );
129
146
130
147
SDVTList Tys = DAG.getVTList (MVT::Other, MVT::Glue);
131
- SDValue Ops[] = { Chain, DAG.getValueType (AVT), InGlue };
132
- Chain = DAG.getNode (X86ISD::REP_STOS, dl, Tys, Ops);
133
-
134
- if (BytesLeft) {
135
- // Handle the last 1 - 7 bytes.
136
- unsigned Offset = SizeVal - BytesLeft;
137
- EVT AddrVT = Dst.getValueType ();
138
- EVT SizeVT = Size .getValueType ();
139
-
140
- Chain =
141
- DAG.getMemset (Chain, dl,
142
- DAG.getNode (ISD::ADD, dl, AddrVT, Dst,
143
- DAG.getConstant (Offset, dl, AddrVT)),
144
- Val, DAG.getConstant (BytesLeft, dl, SizeVT), Alignment,
145
- isVolatile, AlwaysInline,
146
- /* isTailCall */ false , DstPtrInfo.getWithOffset (Offset));
147
- }
148
+ SDValue Ops[] = {Chain, DAG.getValueType (AVT), InGlue};
149
+ SDValue RepStos = DAG.getNode (X86ISD::REP_STOS, dl, Tys, Ops);
150
+
151
+ /// RepStos can process the whole length.
152
+ //
153
+ // Because we changed the alignment earlier in the function to work on size
154
+ // when we have the minsize attribute, this is guaranteed to be 0 when we get
155
+ // here.
156
+ if (BytesLeft == 0 )
157
+ return RepStos;
148
158
149
- // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
150
- return Chain;
159
+ // Handle the last 1 - 7 bytes.
160
+ SmallVector<SDValue, 4 > Results;
161
+ Results.push_back (RepStos);
162
+ unsigned Offset = SizeVal - BytesLeft;
163
+ EVT AddrVT = Dst.getValueType ();
164
+ EVT SizeVT = Size .getValueType ();
165
+
166
+ Results.push_back (
167
+ DAG.getMemset (Chain, dl,
168
+ DAG.getNode (ISD::ADD, dl, AddrVT, Dst,
169
+ DAG.getConstant (Offset, dl, AddrVT)),
170
+ Val, DAG.getConstant (BytesLeft, dl, SizeVT), Alignment,
171
+ isVolatile, /* AlwaysInline */ true ,
172
+ /* isTailCall */ false , DstPtrInfo.getWithOffset (Offset)));
173
+
174
+ return DAG.getNode (ISD::TokenFactor, dl, MVT::Other, Results);
151
175
}
152
176
153
177
/// Emit a single REP MOVS{B,W,D,Q} instruction.
@@ -220,13 +244,42 @@ static SDValue emitConstantSizeRepmov(
220
244
assert (!Subtarget.hasERMSB () && " No efficient RepMovs" );
221
245
/// We assume runtime memcpy will do a better job for unaligned copies when
222
246
/// ERMS is not present.
223
- if (!AlwaysInline && (Alignment. value () & 3 ) != 0 )
247
+ if (!AlwaysInline && (Alignment < Align ( 4 )) )
224
248
return SDValue ();
225
249
250
+ // If we have minsize, then don't care about the alignment.
251
+ // On x86, rep movs works at any address, so operand alignment is moot.
252
+ // As long as the count is aligned, we can use the minimum number of
253
+ // instructions without always having to resort to movsb.
254
+ //
255
+ // Because this is a feature specific to x86, we must handle it here.
256
+
257
+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
258
+ if ((Size & 15 ) == 0 && Subtarget.is64Bit ())
259
+ Alignment = Align (16 );
260
+ else if ((Size & 7 ) == 0 )
261
+ Alignment = Align (8 );
262
+ else if ((Size & 3 ) == 0 )
263
+ Alignment = Align (4 );
264
+ else if ((Size & 1 ) == 0 )
265
+ Alignment = Align (2 );
266
+ else
267
+ Alignment = Align (1 );
268
+ }
269
+
226
270
const MVT BlockType = getOptimalRepmovsType (Subtarget, Alignment);
227
271
const uint64_t BlockBytes = BlockType.getSizeInBits () / 8 ;
228
272
const uint64_t BlockCount = Size / BlockBytes;
229
273
const uint64_t BytesLeft = Size % BlockBytes;
274
+
275
+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
276
+ // Use the one instruction determined. Because we changed the alignment
277
+ // earlier in the function to work on size when we have the minsize
278
+ // attribute, it is guaranteed to process the entire length.
279
+ return emitRepmovs (Subtarget, DAG, dl, Chain, Dst, Src,
280
+ DAG.getIntPtrConstant (BlockCount, dl), BlockType);
281
+ }
282
+
230
283
SDValue RepMovs =
231
284
emitRepmovs (Subtarget, DAG, dl, Chain, Dst, Src,
232
285
DAG.getIntPtrConstant (BlockCount, dl), BlockType);
@@ -237,11 +290,6 @@ static SDValue emitConstantSizeRepmov(
237
290
238
291
assert (BytesLeft && " We have leftover at this point" );
239
292
240
- // / In case we optimize for size we use repmovsb even if it's less efficient
241
- // / so we can save the loads/stores of the leftover.
242
- if (DAG.getMachineFunction ().getFunction ().hasMinSize ())
243
- return emitRepmovsB (Subtarget, DAG, dl, Chain, Dst, Src, Size );
244
-
245
293
// Handle the last 1 - 7 bytes.
246
294
SmallVector<SDValue, 4 > Results;
247
295
Results.push_back (RepMovs);
0 commit comments