Skip to content

Commit 928eb10

Browse files
committed
[ARM] Optimise byval arguments in tail-calls
We don't need to copy byval arguments to tail calls via a temporary, if we can prove that we are not copying from the outgoing argument area. This patch does this when the source if the argument is one of: * Memory in the local stack frame, which can't be used for tail-call arguments. * A global variable. We can also avoid doing the copy completely if the source and destination are the same memory location, which is the case when the caller and callee have the same signature, and pass some arguments through unmodified.
1 parent e504e79 commit 928eb10

File tree

3 files changed

+207
-106
lines changed

3 files changed

+207
-106
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 101 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2325,6 +2325,59 @@ std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
23252325
return std::make_pair(DstAddr, DstInfo);
23262326
}
23272327

2328+
// Returns the type of copying which is required to set up a byval argument to
2329+
// a tail-called function. This isn't needed for non-tail calls, because they
2330+
// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
2331+
// avoid clobbering another argument (CopyViaTemp), and sometimes can be
2332+
// optimised to zero copies when forwarding an argument from the caller's
2333+
// caller (NoCopy).
2334+
ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
2335+
SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
2336+
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2337+
ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
2338+
2339+
// Globals are always safe to copy from.
2340+
if (isa<GlobalAddressSDNode>(Src) || isa<ExternalSymbolSDNode>(Src))
2341+
return CopyOnce;
2342+
2343+
// Can only analyse frame index nodes, conservatively assume we need a
2344+
// temporary.
2345+
auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
2346+
auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
2347+
if (!SrcFrameIdxNode || !DstFrameIdxNode)
2348+
return CopyViaTemp;
2349+
2350+
int SrcFI = SrcFrameIdxNode->getIndex();
2351+
int DstFI = DstFrameIdxNode->getIndex();
2352+
assert(MFI.isFixedObjectIndex(DstFI) &&
2353+
"byval passed in non-fixed stack slot");
2354+
2355+
int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
2356+
int64_t DstOffset = MFI.getObjectOffset(DstFI);
2357+
2358+
// If the source is in the local frame, then the copy to the argument memory
2359+
// is always valid.
2360+
bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
2361+
if (!FixedSrc ||
2362+
(FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
2363+
return CopyOnce;
2364+
2365+
// In the case of byval arguments split between registers and the stack,
2366+
// computeAddrForCallArg returns a FrameIndex which corresponds only to the
2367+
// stack portion, but the Src SDValue will refer to the full value, including
2368+
// the local stack memory that the register portion gets stored into. We only
2369+
// need to compare them for equality, so normalise on the full value version.
2370+
uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
2371+
DstOffset -= RegSize;
2372+
2373+
// If the value is already in the correct location, then no copying is
2374+
// needed. If not, then we need to copy via a temporary.
2375+
if (SrcOffset == DstOffset)
2376+
return NoCopy;
2377+
else
2378+
return CopyViaTemp;
2379+
}
2380+
23282381
void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
23292382
SDValue Chain, SDValue &Arg,
23302383
RegsToPassVector &RegsToPass,
@@ -2499,37 +2552,58 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
24992552
// overwritten by the stores of the outgoing arguments. To avoid this, we
25002553
// need to make a temporary copy of them in local stack space, then copy back
25012554
// to the argument area.
2502-
// TODO This could be optimised to skip byvals which are already being copied
2503-
// from local stack space, or which are copied from the incoming stack at the
2504-
// exact same location.
25052555
DenseMap<unsigned, SDValue> ByValTemporaries;
25062556
SDValue ByValTempChain;
25072557
if (isTailCall) {
2508-
for (unsigned ArgIdx = 0, e = OutVals.size(); ArgIdx != e; ++ArgIdx) {
2509-
SDValue Arg = OutVals[ArgIdx];
2558+
SmallVector<SDValue, 8> ByValCopyChains;
2559+
for (const CCValAssign &VA : ArgLocs) {
2560+
unsigned ArgIdx = VA.getValNo();
2561+
SDValue Src = OutVals[ArgIdx];
25102562
ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
25112563

2512-
if (Flags.isByVal()) {
2513-
int FrameIdx = MFI.CreateStackObject(
2564+
if (!Flags.isByVal())
2565+
continue;
2566+
2567+
SDValue Dst;
2568+
MachinePointerInfo DstInfo;
2569+
std::tie(Dst, DstInfo) =
2570+
computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2571+
ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2572+
2573+
if (Copy == NoCopy) {
2574+
// If the argument is already at the correct offset on the stack
2575+
// (because we are forwarding a byval argument from our caller), we
2576+
// don't need any copying.
2577+
continue;
2578+
} else if (Copy == CopyOnce) {
2579+
// If the argument is in our local stack frame, no other argument
2580+
// preparation can clobber it, so we can copy it to the final location
2581+
// later.
2582+
ByValTemporaries[ArgIdx] = Src;
2583+
} else {
2584+
assert(Copy == CopyViaTemp && "unexpected enum value");
2585+
// If we might be copying this argument from the outgoing argument
2586+
// stack area, we need to copy via a temporary in the local stack
2587+
// frame.
2588+
int TempFrameIdx = MFI.CreateStackObject(
25142589
Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2515-
SDValue Dst =
2516-
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2590+
SDValue Temp =
2591+
DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
25172592

25182593
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
25192594
SDValue AlignNode =
25202595
DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
25212596

25222597
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2523-
SDValue Ops[] = { Chain, Dst, Arg, SizeNode, AlignNode};
2524-
MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2525-
Ops));
2526-
ByValTemporaries[ArgIdx] = Dst;
2598+
SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2599+
ByValCopyChains.push_back(
2600+
DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2601+
ByValTemporaries[ArgIdx] = Temp;
25272602
}
25282603
}
2529-
if (!MemOpChains.empty()) {
2530-
ByValTempChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2531-
MemOpChains.clear();
2532-
}
2604+
if (!ByValCopyChains.empty())
2605+
ByValTempChain =
2606+
DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
25332607
}
25342608

25352609
// During a tail call, stores to the argument area must happen after all of
@@ -2644,13 +2718,17 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
26442718
unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
26452719

26462720
SDValue ByValSrc;
2647-
if (ByValTemporaries.contains(realArgIdx))
2721+
bool NeedsStackCopy;
2722+
if (ByValTemporaries.contains(realArgIdx)) {
26482723
ByValSrc = ByValTemporaries[realArgIdx];
2649-
else
2724+
NeedsStackCopy = true;
2725+
} else {
26502726
ByValSrc = Arg;
2727+
NeedsStackCopy = !isTailCall;
2728+
}
26512729

2730+
// If part of the argument is in registers, load them.
26522731
if (CurByValIdx < ByValArgsCount) {
2653-
26542732
unsigned RegBegin, RegEnd;
26552733
CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
26562734

@@ -2674,7 +2752,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
26742752
CCInfo.nextInRegsParam();
26752753
}
26762754

2677-
if (Flags.getByValSize() > 4*offset) {
2755+
// If the memory part of the argument isn't already in the correct place
2756+
// (which can happen with tail calls), copy it into the argument area.
2757+
if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
26782758
auto PtrVT = getPointerTy(DAG.getDataLayout());
26792759
SDValue Dst;
26802760
MachinePointerInfo DstInfo;

llvm/lib/Target/ARM/ARMISelLowering.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,19 @@ class VectorType;
395395
// ARMTargetLowering - ARM Implementation of the TargetLowering interface
396396

397397
class ARMTargetLowering : public TargetLowering {
398+
// Copying needed for an outgoing byval argument.
399+
enum ByValCopyKind {
400+
// Argument is already in the correct location, no copy needed.
401+
NoCopy,
402+
// Argument value is currently in the local stack frame, needs copying to
403+
// outgoing arguemnt area.
404+
CopyOnce,
405+
// Argument value is currently in the outgoing argument area, but not at
406+
// the correct offset, so needs copying via a temporary in local stack
407+
// space.
408+
CopyViaTemp,
409+
};
410+
398411
public:
399412
explicit ARMTargetLowering(const TargetMachine &TM,
400413
const ARMSubtarget &STI);
@@ -811,6 +824,9 @@ class VectorType;
811824
computeAddrForCallArg(const SDLoc &dl, SelectionDAG &DAG,
812825
const CCValAssign &VA, SDValue StackPtr,
813826
bool IsTailCall, int SPDiff) const;
827+
ByValCopyKind ByValNeedsCopyForTailCall(SelectionDAG &DAG, SDValue Src,
828+
SDValue Dst,
829+
ISD::ArgFlagsTy Flags) const;
814830
SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
815831
SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
816832
SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;

0 commit comments

Comments
 (0)