@@ -66,11 +66,23 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
66
66
// If not DWORD aligned or size is more than the threshold, call the library.
67
67
// The libc version is likely to be faster for these cases. It can use the
68
68
// address value and run time information about the CPU.
69
- if (Alignment < Align (4 ) || !ConstantSize ||
70
- ConstantSize->getZExtValue () > Subtarget.getMaxInlineSizeThreshold ())
69
+ if (!ConstantSize)
71
70
return SDValue ();
72
71
73
72
uint64_t SizeVal = ConstantSize->getZExtValue ();
73
+ if (!AlwaysInline &&
74
+ (Alignment < Align (4 ) || SizeVal > Subtarget.getMaxInlineSizeThreshold ()))
75
+ return SDValue ();
76
+
77
+ // If we have minsize, then we don't care about the alignment.
78
+ // On x86, the CPU doesn't care and neither should you.
79
+ // As long as the count is aligned, we can use the minimum number of
80
+ // instructions without always having to resort to stosb.
81
+ //
82
+ // Because this is a feature specific to x86, we must handle it here.
83
+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ())
84
+ Alignment = commonAlignment (Align (Subtarget.is64Bit () ? 8 : 4 ), SizeVal);
85
+
74
86
SDValue InGlue;
75
87
EVT AVT;
76
88
SDValue Count;
@@ -80,13 +92,13 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
80
92
uint64_t Val = ValC->getZExtValue () & 255 ;
81
93
82
94
// If the value is a constant, then we can potentially use larger sets.
83
- if (Alignment > Align (2 )) {
95
+ if (Alignment >= Align (4 )) {
84
96
// DWORD aligned
85
97
AVT = MVT::i32;
86
98
ValReg = X86::EAX;
87
99
Val = (Val << 8 ) | Val;
88
100
Val = (Val << 16 ) | Val;
89
- if (Subtarget.is64Bit () && Alignment > Align (8 )) { // QWORD aligned
101
+ if (Subtarget.is64Bit () && Alignment >= Align (8 )) { // QWORD aligned
90
102
AVT = MVT::i64;
91
103
ValReg = X86::RAX;
92
104
Val = (Val << 32 ) | Val;
@@ -103,12 +115,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
103
115
Count = DAG.getIntPtrConstant (SizeVal, dl);
104
116
}
105
117
106
- if (AVT.bitsGT (MVT::i8)) {
107
- unsigned UBytes = AVT.getSizeInBits () / 8 ;
108
- Count = DAG.getIntPtrConstant (SizeVal / UBytes, dl);
109
- BytesLeft = SizeVal % UBytes;
110
- }
111
-
118
+ const uint64_t BlockBytes = AVT.getSizeInBits () / 8 ;
119
+ const uint64_t BlockCount = SizeVal / BlockBytes;
120
+ Count = DAG.getIntPtrConstant (BlockCount, dl);
121
+ BytesLeft = SizeVal % BlockBytes;
112
122
Chain = DAG.getCopyToReg (Chain, dl, ValReg, DAG.getConstant (Val, dl, AVT),
113
123
InGlue);
114
124
InGlue = Chain.getValue (1 );
@@ -120,34 +130,41 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
120
130
}
121
131
122
132
bool Use64BitRegs = Subtarget.isTarget64BitLP64 ();
123
- Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
124
- Count, InGlue);
133
+ Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX, Count,
134
+ InGlue);
125
135
InGlue = Chain.getValue (1 );
126
- Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
127
- Dst, InGlue);
136
+ Chain = DAG.getCopyToReg (Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI, Dst,
137
+ InGlue);
128
138
InGlue = Chain.getValue (1 );
129
139
130
140
SDVTList Tys = DAG.getVTList (MVT::Other, MVT::Glue);
131
- SDValue Ops[] = { Chain, DAG.getValueType (AVT), InGlue };
132
- Chain = DAG.getNode (X86ISD::REP_STOS, dl, Tys, Ops);
133
-
134
- if (BytesLeft) {
135
- // Handle the last 1 - 7 bytes.
136
- unsigned Offset = SizeVal - BytesLeft;
137
- EVT AddrVT = Dst.getValueType ();
138
- EVT SizeVT = Size .getValueType ();
139
-
140
- Chain =
141
- DAG.getMemset (Chain, dl,
142
- DAG.getNode (ISD::ADD, dl, AddrVT, Dst,
143
- DAG.getConstant (Offset, dl, AddrVT)),
144
- Val, DAG.getConstant (BytesLeft, dl, SizeVT), Alignment,
145
- isVolatile, AlwaysInline,
146
- /* isTailCall */ false , DstPtrInfo.getWithOffset (Offset));
147
- }
141
+ SDValue Ops[] = {Chain, DAG.getValueType (AVT), InGlue};
142
+ SDValue RepStos = DAG.getNode (X86ISD::REP_STOS, dl, Tys, Ops);
143
+
144
+ // / RepStos can process the whole length.
145
+ //
146
+ // Because we changed the alignment earlier in the function to work on size
147
+ // when we have the minsize attribute, this is guaranteed to be 0 when we get
148
+ // here.
149
+ if (BytesLeft == 0 )
150
+ return RepStos;
148
151
149
- // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
150
- return Chain;
152
+ // Handle the last 1 - 7 bytes.
153
+ SmallVector<SDValue, 4 > Results;
154
+ Results.push_back (RepStos);
155
+ unsigned Offset = SizeVal - BytesLeft;
156
+ EVT AddrVT = Dst.getValueType ();
157
+ EVT SizeVT = Size .getValueType ();
158
+
159
+ Results.push_back (
160
+ DAG.getMemset (Chain, dl,
161
+ DAG.getNode (ISD::ADD, dl, AddrVT, Dst,
162
+ DAG.getConstant (Offset, dl, AddrVT)),
163
+ Val, DAG.getConstant (BytesLeft, dl, SizeVT), Alignment,
164
+ isVolatile, /* AlwaysInline */ true ,
165
+ /* isTailCall */ false , DstPtrInfo.getWithOffset (Offset)));
166
+
167
+ return DAG.getNode (ISD::TokenFactor, dl, MVT::Other, Results);
151
168
}
152
169
153
170
// / Emit a single REP MOVS{B,W,D,Q} instruction.
@@ -220,13 +237,32 @@ static SDValue emitConstantSizeRepmov(
220
237
assert (!Subtarget.hasERMSB () && " No efficient RepMovs" );
221
238
// / We assume runtime memcpy will do a better job for unaligned copies when
222
239
// / ERMS is not present.
223
- if (!AlwaysInline && (Alignment. value () & 3 ) != 0 )
240
+ if (!AlwaysInline && (Alignment < Align ( 4 )) )
224
241
return SDValue ();
225
242
243
+ // If we have minsize, then we don't care about the alignment.
244
+ // On x86, the CPU doesn't care and neither should you.
245
+ // As long as the count is aligned, we can use the minimum number of
246
+ // instructions without always having to resort to movsb.
247
+ //
248
+ // Because this is a feature specific to x86, we must handle it here.
249
+
250
+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ())
251
+ Alignment = commonAlignment (Align (Subtarget.is64Bit () ? 8 : 4 ), Size );
252
+
226
253
const MVT BlockType = getOptimalRepmovsType (Subtarget, Alignment);
227
254
const uint64_t BlockBytes = BlockType.getSizeInBits () / 8 ;
228
255
const uint64_t BlockCount = Size / BlockBytes;
229
256
const uint64_t BytesLeft = Size % BlockBytes;
257
+
258
+ if (DAG.getMachineFunction ().getFunction ().hasMinSize ()) {
259
+ // Use the one instruction determined. Because we changed the alignment
260
+ // earlier in the function to work on size when we have the minsize
261
+ // attribute, it is guaranteed to process the entire length.
262
+ return emitRepmovs (Subtarget, DAG, dl, Chain, Dst, Src,
263
+ DAG.getIntPtrConstant (BlockCount, dl), BlockType);
264
+ }
265
+
230
266
SDValue RepMovs =
231
267
emitRepmovs (Subtarget, DAG, dl, Chain, Dst, Src,
232
268
DAG.getIntPtrConstant (BlockCount, dl), BlockType);
@@ -237,11 +273,6 @@ static SDValue emitConstantSizeRepmov(
237
273
238
274
assert (BytesLeft && " We have leftover at this point" );
239
275
240
- // / In case we optimize for size we use repmovsb even if it's less efficient
241
- // / so we can save the loads/stores of the leftover.
242
- if (DAG.getMachineFunction ().getFunction ().hasMinSize ())
243
- return emitRepmovsB (Subtarget, DAG, dl, Chain, Dst, Src, Size );
244
-
245
276
// Handle the last 1 - 7 bytes.
246
277
SmallVector<SDValue, 4 > Results;
247
278
Results.push_back (RepMovs);
0 commit comments