@@ -28,6 +28,23 @@ static cl::opt<bool>
    UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                     cl::desc("Use fast short rep mov in memcpy lowering"));

+/// Returns the best type to use with repmovs/repstos depending on alignment.
+static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
+  uint64_t Align = Alignment.value();
+  assert((Align != 0) && "Align is normalized");
+  assert(isPowerOf2_64(Align) && "Align is a power of 2");
+  switch (Align) {
+  case 1:
+    return MVT::i8;
+  case 2:
+    return MVT::i16;
+  case 4:
+    return MVT::i32;
+  default:
+    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+  }
+}
+
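For illustration (not part of the patch): the helper maps the known alignment to the widest rep element that divides it. A standalone sketch of the same mapping, with a hypothetical name:

#include <cstdint>

// Standalone model of the alignment -> rep-width mapping above (hypothetical
// helper for illustration; returns the element width in bits).
unsigned repBlockBits(uint64_t AlignBytes, bool Is64Bit) {
  switch (AlignBytes) {
  case 1:  return 8;                  // REP STOSB / REP MOVSB
  case 2:  return 16;                 // REP STOSW / REP MOVSW
  case 4:  return 32;                 // REP STOSD / REP MOVSD
  default: return Is64Bit ? 64 : 32;  // REP STOSQ on 64-bit targets
  }
}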
bool X86SelectionDAGInfo::isBaseRegConflictPossible(
    SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
  // We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -44,102 +61,127 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
  return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
}

-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
-    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
-    MachinePointerInfo DstPtrInfo) const {
-  // If to a segment-relative address space, use the default lowering.
-  if (DstPtrInfo.getAddrSpace() >= 256)
-    return SDValue();
+/// Emit a single REP STOS{B,W,D,Q} instruction.
+static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl, SDValue Chain, SDValue Dst,
+                           SDValue Val, SDValue Size, MVT AVT) {
+  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+  unsigned AX = X86::AL;
+  switch (AVT.getSizeInBits()) {
+  case 8:
+    AX = X86::AL;
+    break;
+  case 16:
+    AX = X86::AX;
+    break;
+  case 32:
+    AX = X86::EAX;
+    break;
+  default:
+    AX = X86::RAX;
+    break;
+  }

-  // If the base register might conflict with our physical registers, bail out.
-  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
-                                  X86::ECX, X86::EAX, X86::EDI};
-  if (isBaseRegConflictPossible(DAG, ClobberSet))
-    return SDValue();
+  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
+  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;

-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  const X86Subtarget &Subtarget =
-      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  SDValue InGlue;
+  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
+  InGlue = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+}
+
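For reference, REP STOS takes its operands in fixed registers, which is what the CopyToReg glue chain above pins down: the fill pattern in rAX, the iteration count in rCX, and the destination in rDI. A rough behavioral model (assumed semantics, not the patch's code):

#include <cstdint>

// Behavioral model of REP STOS with element type T: Val in rAX, Count in
// rCX, Dst in rDI, direction flag assumed clear. Illustration only.
template <typename T>
void repStosModel(T *Dst, T Val, uint64_t Count) {
  for (uint64_t I = 0; I != Count; ++I)
    Dst[I] = Val;
}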
+/// Emit a single REP STOSB instruction for a particular constant size.
+static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                            const SDLoc &dl, SDValue Chain, SDValue Dst,
+                            SDValue Val, uint64_t Size) {
+  return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
+}
+
+/// Returns a REP STOS instruction, possibly with a few load/stores to implement
+/// a constant size memory set. In some cases where we know REP STOS is
+/// inefficient we return an empty SDValue so the calling code can either
+/// generate a store sequence or call the runtime memset function.
+static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget,
+                                       const SDLoc &dl, SDValue Chain,
+                                       SDValue Dst, SDValue Val, uint64_t Size,
+                                       EVT SizeVT, Align Alignment,
+                                       bool isVolatile, bool AlwaysInline,
+                                       MachinePointerInfo DstPtrInfo) {
+  /// In case we optimize for size, we use repstosb even if it's less efficient
+  /// so we can save the loads/stores of the leftover.
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
+      // Special case 0 because otherwise we get large literals,
+      // which causes larger encoding.
+      if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
+        MVT BlockType = MVT::i32;
+        const uint64_t BlockBits = BlockType.getSizeInBits();
+        const uint64_t BlockBytes = BlockBits / 8;
+        const uint64_t BlockCount = Size / BlockBytes;
+
+        Val = DAG.getConstant(0, dl, BlockType);
+        // repstosd is same size as repstosb
+        return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                           DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+      }
+    }
+    return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
+  }
+
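To unpack the special case above: for an all-zero fill, `xor eax, eax; rep stosd` avoids the large immediate a wider splatted constant would need and stores four bytes per iteration, and requiring Size to be a multiple of 32 guarantees, in particular, that the i32 path leaves no tail bytes. A standalone sketch of the decision (hypothetical helper, not in the patch):

#include <cstdint>

// Returns the repstosd iteration count for the minsize zero-fill fast path,
// or 0 when it does not apply (hypothetical mirror of the check above).
uint64_t zeroFillStosdCount(uint64_t Size, uint64_t FillValue) {
  if ((Size & 31) != 0 || (FillValue & 255) != 0)
    return 0;       // fall back to plain repstosb
  return Size / 4;  // i32 blocks, e.g. Size = 96 gives 24 iterations
}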
+  if (Size > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
+  if (Alignment < Align(4))
    return SDValue();

-  uint64_t SizeVal = ConstantSize->getZExtValue();
-  SDValue InGlue;
-  EVT AVT;
-  SDValue Count;
-  unsigned BytesLeft = 0;
+  MVT BlockType = MVT::i8;
+  uint64_t BlockCount = Size;
+  uint64_t BytesLeft = 0;
  if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-    unsigned ValReg;
-    uint64_t Val = ValC->getZExtValue() & 255;
-
-    // If the value is a constant, then we can potentially use larger sets.
-    if (Alignment >= Align(4)) {
-      // DWORD aligned
-      AVT = MVT::i32;
-      ValReg = X86::EAX;
-      Val = (Val << 8) | Val;
-      Val = (Val << 16) | Val;
-      if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
-        AVT = MVT::i64;
-        ValReg = X86::RAX;
-        Val = (Val << 32) | Val;
-      }
-    } else if (Alignment == Align(2)) {
-      // WORD aligned
-      AVT = MVT::i16;
-      ValReg = X86::AX;
-      Val = (Val << 8) | Val;
-    } else {
-      // Byte aligned
-      AVT = MVT::i8;
-      ValReg = X86::AL;
-      Count = DAG.getIntPtrConstant(SizeVal, dl);
-    }
+    BlockType = getOptimalRepType(Subtarget, Alignment);
+    uint64_t Value = ValC->getZExtValue() & 255;
+    const uint64_t BlockBits = BlockType.getSizeInBits();

-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
-    }
+    if (BlockBits >= 16)
+      Value = (Value << 8) | Value;

-    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
-                             InGlue);
-    InGlue = Chain.getValue(1);
-  } else {
-    AVT = MVT::i8;
-    Count = DAG.getIntPtrConstant(SizeVal, dl);
-    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
-    InGlue = Chain.getValue(1);
-  }
+    if (BlockBits >= 32)
+      Value = (Value << 16) | Value;

-  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
-                           Count, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
-                           Dst, InGlue);
-  InGlue = Chain.getValue(1);
+    if (BlockBits >= 64)
+      Value = (Value << 32) | Value;

-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
-  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+    const uint64_t BlockBytes = BlockBits / 8;
+    BlockCount = Size / BlockBytes;
+    BytesLeft = Size % BlockBytes;
+    Val = DAG.getConstant(Value, dl, BlockType);
+  }

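The shift-or cascade above splats the low byte of the fill value across the chosen block type, so 0x42 becomes 0x4242 for i16, 0x42424242 for i32, and 0x4242424242424242 for i64. The same cascade as a standalone sketch (illustration only):

#include <cstdint>

// Replicates the low byte of Value across a block of BlockBits bits,
// where BlockBits is 8, 16, 32, or 64 (mirrors the cascade above).
uint64_t splatByte(uint64_t Value, unsigned BlockBits) {
  Value &= 255;
  if (BlockBits >= 16) Value = (Value << 8) | Value;
  if (BlockBits >= 32) Value = (Value << 16) | Value;
  if (BlockBits >= 64) Value = (Value << 32) | Value;
  return Value;  // splatByte(0x42, 64) == 0x4242424242424242
}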
+  SDValue RepStos =
+      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
  /// RepStos can process the whole length.
  if (BytesLeft == 0)
    return RepStos;

  // Handle the last 1 - 7 bytes.
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepStos);
-  unsigned Offset = SizeVal - BytesLeft;
+  unsigned Offset = Size - BytesLeft;
  EVT AddrVT = Dst.getValueType();
-  EVT SizeVT = Size.getValueType();

  Results.push_back(
      DAG.getMemset(Chain, dl,
@@ -152,6 +194,31 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
}

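A worked example of the block/tail split this function performs (values assumed): Size = 23 with an 8-byte-aligned destination on x86-64 picks BlockType i64, so REP STOSQ runs 23 / 8 = 2 iterations, BytesLeft is 23 % 8 = 7, and the trailing getMemset stores those 7 bytes starting at Offset 16. As a tiny sketch:

#include <cstdint>

struct RepSplit {
  uint64_t BlockCount, BytesLeft, Offset;
};

// Hypothetical helper mirroring the split above: rep iterations plus a short
// tail handled by ordinary stores. splitForRep(23, 8) yields {2, 7, 16}.
RepSplit splitForRep(uint64_t Size, uint64_t BlockBytes) {
  return {Size / BlockBytes, Size % BlockBytes, Size - Size % BlockBytes};
}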
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo) const {
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
+    return SDValue();
+
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  return emitConstantSizeRepstos(
+      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
+      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
+}
+
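About the getAddrSpace() >= 256 bail-out above: the x86 backend uses high address-space numbers for segment-relative pointers, and REP STOS writes through the default segment only, so those cases fall back to the generic lowering. The numbering below is the conventional one (assumed values; worth checking against the X86AS definitions in the backend headers):

// Assumed X86 segment address-space numbers (see namespace X86AS in the
// X86 backend); anything >= 256 is treated as segment-relative here.
enum X86SegmentAS : unsigned { GS = 256, FS = 257, SS = 258 };

inline bool isSegmentRelative(unsigned AddrSpace) { return AddrSpace >= 256; }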
/// Emit a single REP MOVS{B,W,D,Q} instruction.
static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl, SDValue Chain, SDValue Dst,
@@ -182,24 +249,6 @@ static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
}

-/// Returns the best type to use with repmovs depending on alignment.
-static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
-                                 Align Alignment) {
-  uint64_t Align = Alignment.value();
-  assert((Align != 0) && "Align is normalized");
-  assert(isPowerOf2_64(Align) && "Align is a power of 2");
-  switch (Align) {
-  case 1:
-    return MVT::i8;
-  case 2:
-    return MVT::i16;
-  case 4:
-    return MVT::i32;
-  default:
-    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-  }
-}
-
/// Returns a REP MOVS instruction, possibly with a few load/stores to implement
/// a constant size memory copy. In some cases where we know REP MOVS is
/// inefficient we return an empty SDValue so the calling code can either
@@ -209,6 +258,10 @@ static SDValue emitConstantSizeRepmov(
    SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
    Align Alignment, bool isVolatile, bool AlwaysInline,
    MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
+  /// In case we optimize for size, we use repmovsb even if it's less efficient
+  /// so we can save the loads/stores of the leftover.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);

  /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
  /// efficient.
@@ -222,10 +275,10 @@ static SDValue emitConstantSizeRepmov(
  assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
  /// We assume runtime memcpy will do a better job for unaligned copies when
  /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+  if (!AlwaysInline && (Alignment < Align(4)))
    return SDValue();

-  const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
+  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
  const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
  const uint64_t BlockCount = Size / BlockBytes;
  const uint64_t BytesLeft = Size % BlockBytes;
@@ -239,11 +292,6 @@ static SDValue emitConstantSizeRepmov(

  assert(BytesLeft && "We have leftover at this point");

-  /// In case we optimize for size we use repmovsb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
-
  // Handle the last 1 - 7 bytes.
  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
@@ -282,7 +330,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
  if (UseFSRMForMemcpy && Subtarget.hasFSRM())
    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);

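Context for the FSRM guard above: on CPUs with Fast Short REP MOV, REP MOVSB is fast even for small copies, so the lowering can emit it unconditionally, for non-constant sizes too, with no tail handling. Its byte-granular behavior is roughly (illustrative model, not the patch's code):

#include <cstddef>

// Byte-level model of REP MOVSB: Src in rSI, Dst in rDI, count in rCX,
// direction flag assumed clear. Illustration only.
void *repMovsbModel(void *Dst, const void *Src, size_t N) {
  unsigned char *D = static_cast<unsigned char *>(Dst);
  const unsigned char *S = static_cast<const unsigned char *>(Src);
  while (N--)
    *D++ = *S++;
  return Dst;
}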
-  /// Handle constant sizes,
+  /// Handle constant sizes
  if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
    return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                  ConstantSize->getZExtValue(),