@@ -2841,6 +2841,96 @@ void SITargetLowering::insertCopiesSplitCSR(
   }
 }

+/// Classes for spilling inreg VGPR arguments.
+///
+/// When an argument marked inreg is pushed to a VGPR, it indicates that the
+/// available SGPRs for argument passing have been exhausted. In such cases,
+/// it is preferable to pack multiple inreg arguments into individual lanes of
+/// VGPRs instead of assigning each directly to separate VGPRs.
+///
+/// Spilling involves two parts: the caller-side (call site) and the
+/// callee-side. Both must follow the same method for selecting registers and
+/// lanes, ensuring that an argument written at the call site matches exactly
+/// the one read at the callee.
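+///
+/// As an illustrative sketch (not tied to any particular ABI state): if i32
+/// arguments %x and %y are marked inreg but no SGPRs remain, the call site
+/// packs them with llvm.amdgcn.writelane into lanes 0 and 1 of one VGPR, and
+/// the callee recovers them with llvm.amdgcn.readlane from the same lanes.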
+
+/// The spilling class for the callee-side that lowers the unpacking of the
+/// incoming spilled arguments.
+class InregVGPRSpillerCallee {
+  CCState &State;
+  SelectionDAG &DAG;
+  MachineFunction &MF;
+
+  Register SrcReg;
+  SDValue SrcVal;
+  unsigned CurLane = 0;
+
+public:
+  InregVGPRSpillerCallee(SelectionDAG &DAG, MachineFunction &MF,
+                         CCState &State)
+      : State(State), DAG(DAG), MF(MF) {}
+
+  SDValue readLane(SDValue Chain, const SDLoc &SL, Register &Reg, EVT VT) {
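+    // All 'inreg' arguments spilled to VGPR lanes share one register: the
+    // first call sets up SrcReg as a live-in and copies its value; later
+    // calls release the VGPR the calling convention had assigned, since the
+    // value arrives in a lane of SrcReg instead.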
+    if (SrcVal) {
+      State.DeallocateReg(Reg);
+    } else {
+      Reg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+      SrcReg = Reg;
+      SrcVal = DAG.getCopyFromReg(Chain, SL, Reg, VT);
+    }
+    // According to the calling convention, VGPR0-31 are used for passing
+    // function arguments, whether they are regular arguments or 'inreg'
+    // arguments that were spilled into VGPRs. Therefore, at most 32 'inreg'
+    // arguments can be spilled to VGPRs.
+    assert(CurLane < 32 && "more than expected VGPR inreg arguments");
+    SmallVector<SDValue, 4> Operands{
+        DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
+        DAG.getRegister(SrcReg, VT),
+        DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+  }
+};
+
+/// The spilling class for the caller-side that lowers the packing of call
+/// site arguments.
+class InregVGPRSpillerCallSite {
+  Register DstReg;
+  SDValue LastWrite;
+  unsigned CurLane = 0;
+
+  SelectionDAG &DAG;
+  MachineFunction &MF;
+
+public:
+  InregVGPRSpillerCallSite(SelectionDAG &DAG, MachineFunction &MF)
+      : DAG(DAG), MF(MF) {}
+
+  void writeLane(const SDLoc &SL, Register &Reg, SDValue Val, EVT VT) {
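+    // Mirror of readLane on the callee side: the first spilled argument
+    // picks the destination VGPR, and each subsequent argument is routed
+    // into the next lane of that same register.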
+    if (DstReg.isValid())
+      Reg = DstReg;
+    else
+      DstReg = Reg;
+    // According to the calling convention, VGPR0-31 are used for passing
+    // function arguments, whether they are regular arguments or 'inreg'
+    // arguments that were spilled into VGPRs. Therefore, at most 32 'inreg'
+    // arguments can be spilled to VGPRs.
+    assert(CurLane < 32 && "more than expected VGPR inreg arguments");
+    SmallVector<SDValue, 4> Operands{
+        DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32), Val,
+        DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
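+    // writelane also takes the previous value of the destination VGPR as
+    // its final operand: the first write is seeded from the function's
+    // live-in virtual register for DstReg, and every later write chains off
+    // the previous one.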
+    if (!LastWrite) {
+      Register VReg = MF.getRegInfo().getLiveInVirtReg(DstReg);
+      LastWrite = DAG.getRegister(VReg, VT);
+    }
+    Operands.push_back(LastWrite);
+    LastWrite = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+  }
+
+  SDValue finalize(SDValue Chain, const SDLoc &SL, SDValue InGlue) {
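+    // No 'inreg' argument was spilled at this call site; leave the chain
+    // unchanged.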
+    if (!LastWrite)
+      return SDValue();
+    return DAG.getCopyToReg(Chain, SL, DstReg, LastWrite, InGlue);
+  }
+};
+
 SDValue SITargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -2963,6 +3053,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
   // kern arg offset.
   const Align KernelArgBaseAlign = Align(16);
+  InregVGPRSpillerCallee Spiller(DAG, MF, CCInfo);

   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
@@ -3130,8 +3221,17 @@ SDValue SITargetLowering::LowerFormalArguments(
         llvm_unreachable("Unexpected register class in LowerFormalArguments!");
       EVT ValVT = VA.getValVT();

-      Reg = MF.addLiveIn(Reg, RC);
-      SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      SDValue Val;
+      // If an argument is marked inreg but gets pushed to a VGPR, it
+      // indicates we've run out of SGPRs for argument passing. In such
+      // cases, we'd prefer to start packing inreg arguments into individual
+      // lanes of VGPRs, rather than placing them directly into VGPRs.
+      if (RC == &AMDGPU::VGPR_32RegClass && Arg.Flags.isInReg()) {
+        Val = Spiller.readLane(Chain, DL, Reg, VT);
+      } else {
+        Reg = MF.addLiveIn(Reg, RC);
+        Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      }

       if (Arg.Flags.isSRet()) {
         // The return object should be reasonably addressable.
@@ -3373,7 +3473,7 @@ SDValue SITargetLowering::LowerCallResult(
 // from the explicit user arguments present in the IR.
 void SITargetLowering::passSpecialInputs(
     CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
-    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass,
     SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
   // If we don't have a call site, this was a call inserted by
   // legalization. These can never use special inputs.
@@ -3817,7 +3917,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }

   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;

   // Analyze operands of the call, assigning locations to each operand.
@@ -3875,6 +3975,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,

   MVT PtrVT = MVT::i32;

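+  // Packs 'inreg' arguments that overflowed into VGPRs into lanes of a
+  // single VGPR at this call site.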
+  InregVGPRSpillerCallSite Spiller(DAG, MF);
+
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
@@ -3988,8 +4090,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   SDValue InGlue;

   unsigned ArgIdx = 0;
-  for (auto [Reg, Val] : RegsToPass) {
-    if (ArgIdx++ >= NumSpecialInputs &&
+  for (auto &[Reg, Val] : RegsToPass) {
+    if (ArgIdx >= NumSpecialInputs &&
         (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
       // For chain calls, the inreg arguments are required to be
       // uniform. Speculatively Insert a readfirstlane in case we cannot prove
@@ -4008,7 +4110,21 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                         ReadfirstlaneArgs);
     }

-    Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
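+    // Arguments spilled into VGPR lanes get no CopyToReg of their own here;
+    // writeLane accumulates them and finalize() emits a single CopyToReg of
+    // the shared VGPR after this loop.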
+    if (ArgIdx >= NumSpecialInputs &&
+        Outs[ArgIdx - NumSpecialInputs].Flags.isInReg() &&
+        AMDGPU::VGPR_32RegClass.contains(Reg)) {
+      Spiller.writeLane(DL, Reg, Val,
+                        ArgLocs[ArgIdx - NumSpecialInputs].getLocVT());
+    } else {
+      Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
+      InGlue = Chain.getValue(1);
+    }
+
+    ++ArgIdx;
+  }
+
+  if (SDValue R = Spiller.finalize(Chain, DL, InGlue)) {
+    Chain = R;
     InGlue = Chain.getValue(1);
   }
