Commit f25e932
[X86] For inline memset/memcpy with minsize, use byte or double-word accesses
repstosb and repstosd are the same size, but stosd is only used when the value is 0: broadcasting a non-zero constant across the bytes of the 32-bit value takes extra instructions that increase the overall size, while for 0 no broadcast is needed at all. The same reasoning applies to memcpy, where the minsize check was moved ahead because an encoded jmp to memcpy takes more bytes than rep movsb.
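To make the size tradeoff concrete: the "broadcast" the message refers to replicates the 8-bit fill value across a wider store unit. A minimal standalone C++ sketch of that arithmetic, mirroring the Value shift chain in the patch below (broadcastByte is an illustrative name, not part of the patch):

#include <cstdint>

// Replicate an 8-bit fill value across a block of blockBits bits.
// A non-zero result must be materialized as a wide immediate, costing
// extra bytes; for 0 the broadcast is a no-op, which is why only the
// zero case is widened to rep stosd under minsize.
uint64_t broadcastByte(uint64_t val, unsigned blockBits) {
  val &= 255;
  if (blockBits >= 16)
    val = (val << 8) | val;  // 0xAB -> 0xABAB
  if (blockBits >= 32)
    val = (val << 16) | val; // 0xABAB -> 0xABABABAB
  if (blockBits >= 64)
    val = (val << 32) | val; // -> 0xABABABABABABABAB
  return val;
}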
1 parent 4363cca commit f25e932

File tree

5 files changed: +197, -283 lines


llvm/lib/Target/X86/X86SelectionDAGInfo.cpp

+148-100
@@ -28,6 +28,23 @@ static cl::opt<bool>
     UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                      cl::desc("Use fast short rep mov in memcpy lowering"));
 
+/// Returns the best type to use with repmovs/repstos depending on alignment.
+static MVT getOptimalRepType(const X86Subtarget &Subtarget, Align Alignment) {
+  uint64_t Align = Alignment.value();
+  assert((Align != 0) && "Align is normalized");
+  assert(isPowerOf2_64(Align) && "Align is a power of 2");
+  switch (Align) {
+  case 1:
+    return MVT::i8;
+  case 2:
+    return MVT::i16;
+  case 4:
+    return MVT::i32;
+  default:
+    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+  }
+}
+
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
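For reference, the mapping getOptimalRepType encodes, restated as a plain-integer sketch (the real code returns LLVM MVTs; a 64-bit target is assumed for the default case):

// Bytes per REP element for a given power-of-two alignment.
unsigned repElementBytes(uint64_t align) {
  switch (align) {
  case 1:  return 1; // rep stosb / rep movsb
  case 2:  return 2; // rep stosw / rep movsw
  case 4:  return 4; // rep stosd / rep movsd
  default: return 8; // rep stosq / rep movsq, alignment >= 8
  }
}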
@@ -44,102 +61,127 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
   return llvm::is_contained(ClobberSet, TRI->getBaseRegister());
 }
 
-SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
-    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
-    MachinePointerInfo DstPtrInfo) const {
-  // If to a segment-relative address space, use the default lowering.
-  if (DstPtrInfo.getAddrSpace() >= 256)
-    return SDValue();
+/// Emit a single REP STOS{B,W,D,Q} instruction.
+static SDValue emitRepstos(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl, SDValue Chain, SDValue Dst,
+                           SDValue Val, SDValue Size, MVT AVT) {
+  const bool Use64BitRegs = Subtarget.isTarget64BitLP64();
+  unsigned AX = X86::AL;
+  switch (AVT.getSizeInBits()) {
+  case 8:
+    AX = X86::AL;
+    break;
+  case 16:
+    AX = X86::AX;
+    break;
+  case 32:
+    AX = X86::EAX;
+    break;
+  default:
+    AX = X86::RAX;
+    break;
+  }
 
-  // If the base register might conflict with our physical registers, bail out.
-  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
-                                  X86::ECX, X86::EAX, X86::EDI};
-  if (isBaseRegConflictPossible(DAG, ClobberSet))
-    return SDValue();
+  const unsigned CX = Use64BitRegs ? X86::RCX : X86::ECX;
+  const unsigned DI = Use64BitRegs ? X86::RDI : X86::EDI;
 
-  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  const X86Subtarget &Subtarget =
-      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  SDValue InGlue;
+  Chain = DAG.getCopyToReg(Chain, dl, AX, Val, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, CX, Size, InGlue);
+  InGlue = Chain.getValue(1);
+  Chain = DAG.getCopyToReg(Chain, dl, DI, Dst, InGlue);
+  InGlue = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
+  return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+}
+
+/// Emit a single REP STOSB instruction for a particular constant size.
+static SDValue emitRepstosB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                            const SDLoc &dl, SDValue Chain, SDValue Dst,
+                            SDValue Val, uint64_t Size) {
+  return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                     DAG.getIntPtrConstant(Size, dl), MVT::i8);
+}
+
+/// Returns a REP STOS instruction, possibly with a few load/stores to implement
+/// a constant size memory set. In some cases where we know REP STOS is
+/// inefficient we return an empty SDValue so the calling code can either
+/// generate a store sequence or call the runtime memset function.
+static SDValue emitConstantSizeRepstos(SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget,
+                                       const SDLoc &dl, SDValue Chain,
+                                       SDValue Dst, SDValue Val, uint64_t Size,
+                                       EVT SizeVT, Align Alignment,
+                                       bool isVolatile, bool AlwaysInline,
+                                       MachinePointerInfo DstPtrInfo) {
+  /// In case we optimize for size, we use repstosb even if it's less efficient
+  /// so we can save the loads/stores of the leftover.
+  if (DAG.getMachineFunction().getFunction().hasMinSize()) {
+    if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
+      // Special case 0 because otherwise we get large literals,
+      // which causes larger encoding.
+      if ((Size & 31) == 0 && (ValC->getZExtValue() & 255) == 0) {
+        MVT BlockType = MVT::i32;
+        const uint64_t BlockBits = BlockType.getSizeInBits();
+        const uint64_t BlockBytes = BlockBits / 8;
+        const uint64_t BlockCount = Size / BlockBytes;
+
+        Val = DAG.getConstant(0, dl, BlockType);
+        // repstosd is same size as repstosb
+        return emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                           DAG.getIntPtrConstant(BlockCount, dl), BlockType);
+      }
+    }
+    return emitRepstosB(Subtarget, DAG, dl, Chain, Dst, Val, Size);
+  }
+
+  if (Size > Subtarget.getMaxInlineSizeThreshold())
+    return SDValue();
 
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())
+  if (Alignment < Align(4))
     return SDValue();
 
-  uint64_t SizeVal = ConstantSize->getZExtValue();
-  SDValue InGlue;
-  EVT AVT;
-  SDValue Count;
-  unsigned BytesLeft = 0;
+  MVT BlockType = MVT::i8;
+  uint64_t BlockCount = Size;
+  uint64_t BytesLeft = 0;
   if (auto *ValC = dyn_cast<ConstantSDNode>(Val)) {
-    unsigned ValReg;
-    uint64_t Val = ValC->getZExtValue() & 255;
-
-    // If the value is a constant, then we can potentially use larger sets.
-    if (Alignment >= Align(4)) {
-      // DWORD aligned
-      AVT = MVT::i32;
-      ValReg = X86::EAX;
-      Val = (Val << 8) | Val;
-      Val = (Val << 16) | Val;
-      if (Subtarget.is64Bit() && Alignment >= Align(8)) { // QWORD aligned
-        AVT = MVT::i64;
-        ValReg = X86::RAX;
-        Val = (Val << 32) | Val;
-      }
-    } else if (Alignment == Align(2)) {
-      // WORD aligned
-      AVT = MVT::i16;
-      ValReg = X86::AX;
-      Val = (Val << 8) | Val;
-    } else {
-      // Byte aligned
-      AVT = MVT::i8;
-      ValReg = X86::AL;
-      Count = DAG.getIntPtrConstant(SizeVal, dl);
-    }
+    BlockType = getOptimalRepType(Subtarget, Alignment);
+    uint64_t Value = ValC->getZExtValue() & 255;
+    const uint64_t BlockBits = BlockType.getSizeInBits();
 
-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
-    }
+    if (BlockBits >= 16)
+      Value = (Value << 8) | Value;
 
-    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
-                             InGlue);
-    InGlue = Chain.getValue(1);
-  } else {
-    AVT = MVT::i8;
-    Count = DAG.getIntPtrConstant(SizeVal, dl);
-    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InGlue);
-    InGlue = Chain.getValue(1);
-  }
+    if (BlockBits >= 32)
+      Value = (Value << 16) | Value;
 
-  bool Use64BitRegs = Subtarget.isTarget64BitLP64();
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RCX : X86::ECX,
-                           Count, InGlue);
-  InGlue = Chain.getValue(1);
-  Chain = DAG.getCopyToReg(Chain, dl, Use64BitRegs ? X86::RDI : X86::EDI,
-                           Dst, InGlue);
-  InGlue = Chain.getValue(1);
+    if (BlockBits >= 64)
+      Value = (Value << 32) | Value;
 
-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = {Chain, DAG.getValueType(AVT), InGlue};
-  SDValue RepStos = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
+    const uint64_t BlockBytes = BlockBits / 8;
+    BlockCount = Size / BlockBytes;
+    BytesLeft = Size % BlockBytes;
+    Val = DAG.getConstant(Value, dl, BlockType);
+  }
 
+  SDValue RepStos =
+      emitRepstos(Subtarget, DAG, dl, Chain, Dst, Val,
+                  DAG.getIntPtrConstant(BlockCount, dl), BlockType);
   /// RepStos can process the whole length.
   if (BytesLeft == 0)
     return RepStos;
 
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepStos);
-  unsigned Offset = SizeVal - BytesLeft;
+  unsigned Offset = Size - BytesLeft;
   EVT AddrVT = Dst.getValueType();
-  EVT SizeVT = Size.getValueType();
 
   Results.push_back(
       DAG.getMemset(Chain, dl,
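A rough size argument for the zero special case in emitConstantSizeRepstos above, using standard x86 encoding sizes (an illustration, not part of the patch):

// The REP string ops are the same size either way; the cost difference
// is in materializing the fill value.
constexpr int MovAlImm8 = 2;   // B0 ib: fill AL for rep stosb
constexpr int MovEaxImm32 = 5; // B8 id: broadcast non-zero dword for rep stosd
constexpr int XorEaxEax = 2;   // 31 C0: materialize 0 for rep stosd
constexpr int RepStosB = 2;    // F3 AA
constexpr int RepStosD = 2;    // F3 AB
static_assert(RepStosB == RepStosD, "widening the element is free");
static_assert(XorEaxEax <= MovAlImm8 && MovEaxImm32 > MovAlImm8,
              "only a zero fill widens without growing the sequence");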
@@ -152,6 +194,31 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }
 
+SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
+    MachinePointerInfo DstPtrInfo) const {
+  // If to a segment-relative address space, use the default lowering.
+  if (DstPtrInfo.getAddrSpace() >= 256)
+    return SDValue();
+
+  // If the base register might conflict with our physical registers, bail out.
+  const MCPhysReg ClobberSet[] = {X86::RCX, X86::RAX, X86::RDI,
+                                  X86::ECX, X86::EAX, X86::EDI};
+  if (isBaseRegConflictPossible(DAG, ClobberSet))
+    return SDValue();
+
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+
+  const X86Subtarget &Subtarget =
+      DAG.getMachineFunction().getSubtarget<X86Subtarget>();
+  return emitConstantSizeRepstos(
+      DAG, Subtarget, dl, Chain, Dst, Val, ConstantSize->getZExtValue(),
+      Size.getValueType(), Alignment, isVolatile, AlwaysInline, DstPtrInfo);
+}
+
 /// Emit a single REP MOVS{B,W,D,Q} instruction.
 static SDValue emitRepmovs(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl, SDValue Chain, SDValue Dst,
@@ -182,24 +249,6 @@ static SDValue emitRepmovsB(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                      DAG.getIntPtrConstant(Size, dl), MVT::i8);
 }
 
-/// Returns the best type to use with repmovs depending on alignment.
-static MVT getOptimalRepmovsType(const X86Subtarget &Subtarget,
-                                 Align Alignment) {
-  uint64_t Align = Alignment.value();
-  assert((Align != 0) && "Align is normalized");
-  assert(isPowerOf2_64(Align) && "Align is a power of 2");
-  switch (Align) {
-  case 1:
-    return MVT::i8;
-  case 2:
-    return MVT::i16;
-  case 4:
-    return MVT::i32;
-  default:
-    return Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-  }
-}
-
 /// Returns a REP MOVS instruction, possibly with a few load/stores to implement
 /// a constant size memory copy. In some cases where we know REP MOVS is
 /// inefficient we return an empty SDValue so the calling code can either
@@ -209,6 +258,10 @@ static SDValue emitConstantSizeRepmov(
     SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, EVT SizeVT,
     Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) {
+  /// In case we optimize for size, we use repmovsb even if it's less efficient
+  /// so we can save the loads/stores of the leftover.
+  if (DAG.getMachineFunction().getFunction().hasMinSize())
+    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
 
   /// TODO: Revisit next line: big copy with ERMSB on march >= haswell are very
   /// efficient.
@@ -222,10 +275,10 @@ static SDValue emitConstantSizeRepmov(
   assert(!Subtarget.hasERMSB() && "No efficient RepMovs");
   /// We assume runtime memcpy will do a better job for unaligned copies when
   /// ERMS is not present.
-  if (!AlwaysInline && (Alignment.value() & 3) != 0)
+  if (!AlwaysInline && (Alignment < Align(4)))
     return SDValue();
 
-  const MVT BlockType = getOptimalRepmovsType(Subtarget, Alignment);
+  const MVT BlockType = getOptimalRepType(Subtarget, Alignment);
   const uint64_t BlockBytes = BlockType.getSizeInBits() / 8;
   const uint64_t BlockCount = Size / BlockBytes;
   const uint64_t BytesLeft = Size % BlockBytes;
@@ -239,11 +292,6 @@
 
   assert(BytesLeft && "We have leftover at this point");
 
-  /// In case we optimize for size we use repmovsb even if it's less efficient
-  /// so we can save the loads/stores of the leftover.
-  if (DAG.getMachineFunction().getFunction().hasMinSize())
-    return emitRepmovsB(Subtarget, DAG, dl, Chain, Dst, Src, Size);
-
   // Handle the last 1 - 7 bytes.
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepMovs);
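The "last 1 - 7 bytes" fall out of the block split; a standalone sketch of the BlockCount/BytesLeft computation shared by the memset and memcpy paths (illustrative names, not from the patch):

#include <cstdint>

struct RepSplit {
  uint64_t blocks;   // iterations executed by REP STOS/MOVS
  uint64_t leftover; // trailing bytes, emitted as a small memset/memcpy
};

RepSplit splitForRep(uint64_t size, uint64_t blockBytes) {
  return {size / blockBytes, size % blockBytes};
}

// e.g. splitForRep(30, 8) yields {3, 6}: three qword iterations, then a
// 6-byte tail (the 1-7 leftover bytes the comment above refers to).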
@@ -282,7 +330,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
   if (UseFSRMForMemcpy && Subtarget.hasFSRM())
     return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
 
-  /// Handle constant sizes,
+  /// Handle constant sizes
   if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
     return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
                                   ConstantSize->getZExtValue(),
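Paraphrasing the new minsize policy for constant-size memset (a sketch under the assumptions above; pickMinSizeMemset is a hypothetical name, and the real code also handles non-constant fill values):

#include <cstdint>

enum class Lowering { RepStosD, RepStosB };

// Zero fills whose size is a multiple of 32 bytes widen to rep stosd
// (zero needs no broadcast and the i32 count divides evenly); everything
// else uses rep stosb. Under minsize the library call is never chosen.
Lowering pickMinSizeMemset(uint64_t size, uint8_t fillByte) {
  if ((size & 31) == 0 && fillByte == 0)
    return Lowering::RepStosD;
  return Lowering::RepStosB;
}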

llvm/test/CodeGen/X86/memcpy-struct-by-value.ll

+4-4
@@ -78,9 +78,9 @@ define void @test2(ptr nocapture %x) nounwind minsize {
 ; NOFAST32-NEXT:    pushl %esi
 ; NOFAST32-NEXT:    subl $4100, %esp # imm = 0x1004
 ; NOFAST32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; NOFAST32-NEXT:    movl $1024, %ecx # imm = 0x400
+; NOFAST32-NEXT:    movl $4096, %ecx # imm = 0x1000
 ; NOFAST32-NEXT:    movl %esp, %edi
-; NOFAST32-NEXT:    rep;movsl (%esi), %es:(%edi)
+; NOFAST32-NEXT:    rep;movsb (%esi), %es:(%edi)
 ; NOFAST32-NEXT:    calll foo@PLT
 ; NOFAST32-NEXT:    addl $4100, %esp # imm = 0x1004
 ; NOFAST32-NEXT:    popl %esi
@@ -106,9 +106,9 @@ define void @test2(ptr nocapture %x) nounwind minsize {
 ; NOFAST: # %bb.0:
 ; NOFAST-NEXT:    subq $4104, %rsp # imm = 0x1008
 ; NOFAST-NEXT:    movq %rdi, %rsi
-; NOFAST-NEXT:    movl $512, %ecx # imm = 0x200
+; NOFAST-NEXT:    movl $4096, %ecx # imm = 0x1000
 ; NOFAST-NEXT:    movq %rsp, %rdi
-; NOFAST-NEXT:    rep;movsq (%rsi), %es:(%rdi)
+; NOFAST-NEXT:    rep;movsb (%rsi), %es:(%rdi)
 ; NOFAST-NEXT:    callq foo@PLT
 ; NOFAST-NEXT:    addq $4104, %rsp # imm = 0x1008
 ; NOFAST-NEXT:    retq
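The updated immediates are pure element-size arithmetic: the 4096-byte copy was previously counted in dwords (4096 / 4 = 1024) on 32-bit and qwords (4096 / 8 = 512) on 64-bit, while rep movsb counts bytes directly:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t bytes = 4096;
  assert(bytes / 4 == 1024); // old 32-bit count: rep movsl elements
  assert(bytes / 8 == 512);  // old 64-bit count: rep movsq elements
  assert(bytes / 1 == 4096); // new count: rep movsb elements
}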

llvm/test/CodeGen/X86/memcpy.ll

+12-8
@@ -202,14 +202,16 @@ define void @test3_minsize(ptr nocapture %A, ptr nocapture %B) nounwind minsize
 ; DARWIN-LABEL: test3_minsize:
 ; DARWIN: ## %bb.0:
 ; DARWIN-NEXT:    pushq $64
-; DARWIN-NEXT:    popq %rdx
-; DARWIN-NEXT:    jmp _memcpy ## TAILCALL
+; DARWIN-NEXT:    popq %rcx
+; DARWIN-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; DARWIN-NEXT:    retq
 ;
 ; LINUX-LABEL: test3_minsize:
 ; LINUX: # %bb.0:
 ; LINUX-NEXT:    pushq $64
-; LINUX-NEXT:    popq %rdx
-; LINUX-NEXT:    jmp memcpy@PLT # TAILCALL
+; LINUX-NEXT:    popq %rcx
+; LINUX-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; LINUX-NEXT:    retq
 ;
 ; LINUX-SKL-LABEL: test3_minsize:
 ; LINUX-SKL: # %bb.0:
@@ -249,14 +251,16 @@ define void @test3_minsize_optsize(ptr nocapture %A, ptr nocapture %B) nounwind
 ; DARWIN-LABEL: test3_minsize_optsize:
 ; DARWIN: ## %bb.0:
 ; DARWIN-NEXT:    pushq $64
-; DARWIN-NEXT:    popq %rdx
-; DARWIN-NEXT:    jmp _memcpy ## TAILCALL
+; DARWIN-NEXT:    popq %rcx
+; DARWIN-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; DARWIN-NEXT:    retq
 ;
 ; LINUX-LABEL: test3_minsize_optsize:
 ; LINUX: # %bb.0:
 ; LINUX-NEXT:    pushq $64
-; LINUX-NEXT:    popq %rdx
-; LINUX-NEXT:    jmp memcpy@PLT # TAILCALL
+; LINUX-NEXT:    popq %rcx
+; LINUX-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; LINUX-NEXT:    retq
 ;
 ; LINUX-SKL-LABEL: test3_minsize_optsize:
 ; LINUX-SKL: # %bb.0:
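These checks show the intended size win: the tail call becomes an inline rep movsb plus ret, and the count is popped into %rcx (the REP count register) rather than %rdx (memcpy's size argument). A rough byte accounting with standard x86-64 encodings (illustrative, ignoring any prefix differences):

constexpr int JmpRel32 = 5; // E9 rel32: jmp memcpy@PLT
constexpr int RepMovsb = 2; // F3 A4: rep movsb
constexpr int Ret = 1;      // C3: retq
static_assert(RepMovsb + Ret < JmpRel32,
              "rep movsb + ret encodes smaller than a tail call to memcpy");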
