@@ -2841,6 +2841,96 @@ void SITargetLowering::insertCopiesSplitCSR(
   }
 }
 
+/// Classes for spilling inreg VGPR arguments.
+///
+/// When an argument marked inreg is pushed to a VGPR, it indicates that the
+/// available SGPRs for argument passing have been exhausted. In such cases,
+/// it is preferable to pack multiple inreg arguments into individual lanes
+/// of VGPRs instead of assigning each directly to a separate VGPR.
+///
+/// Spilling involves two parts: the caller side (call site) and the callee
+/// side. Both must follow the same method for selecting registers and lanes,
+/// ensuring that an argument written at the call site matches exactly the
+/// one read at the callee.
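+///
+/// For example (illustrative only): given two inreg arguments %a and %b that
+/// no longer fit in SGPRs, the call site writes %a to lane 0 and %b to lane 1
+/// of one VGPR via llvm.amdgcn.writelane, and the callee reads them back from
+/// the same lanes via llvm.amdgcn.readlane.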
+
+/// The spilling class for the callee side that lowers the unpacking of
+/// formal arguments that were packed into VGPR lanes at the call site.
+class InregVGPRSpillerCallee {
+  CCState &State;
+  SelectionDAG &DAG;
+  MachineFunction &MF;
+
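+  // The physical VGPR that carries the packed inreg arguments, the value
+  // copied from it once it has been made live-in, and the next lane to read.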
+  Register SrcReg;
+  SDValue SrcVal;
+  unsigned CurLane = 0;
+
+public:
+  InregVGPRSpillerCallee(SelectionDAG &DAG, MachineFunction &MF,
+                         CCState &State)
+      : State(State), DAG(DAG), MF(MF) {}
+
+  SDValue readLane(SDValue Chain, const SDLoc &SL, Register &Reg, EVT VT) {
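+    // Only the first argument makes the source VGPR live-in and copies it;
+    // subsequent arguments read further lanes of that same copy, so the
+    // registers tentatively assigned to them are returned to the allocator.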
+    if (SrcVal) {
+      State.DeallocateReg(Reg);
+    } else {
+      Reg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+      SrcReg = Reg;
+      SrcVal = DAG.getCopyFromReg(Chain, SL, Reg, VT);
+    }
+    // According to the calling convention, VGPR0-31 are used for passing
+    // function arguments, no matter whether they are regular arguments or
+    // 'inreg' arguments that get spilled into VGPRs. Therefore, at most 32
+    // 'inreg' arguments can be spilled to VGPRs.
+    assert(CurLane < 32 && "more than expected VGPR inreg arguments");
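+    // Build llvm.amdgcn.readlane(SrcReg, CurLane) and advance the lane
+    // cursor; writeLane at the call site assigns lanes in the same order.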
+    SmallVector<SDValue, 4> Operands{
+        DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
+        DAG.getRegister(SrcReg, VT),
+        DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+  }
+};
+
+/// The spilling class for the caller side that lowers the packing of call
+/// site arguments.
+class InregVGPRSpillerCallSite {
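+  // The physical VGPR that receives the packed arguments, the most recent
+  // writelane node (threaded through as the next write's tied input), and
+  // the next lane to write.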
+  Register DstReg;
+  SDValue LastWrite;
+  unsigned CurLane = 0;
+
+  SelectionDAG &DAG;
+  MachineFunction &MF;
+
+public:
+  InregVGPRSpillerCallSite(SelectionDAG &DAG, MachineFunction &MF)
+      : DAG(DAG), MF(MF) {}
+
+  void writeLane(const SDLoc &SL, Register &Reg, SDValue Val, EVT VT) {
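+    // All inreg arguments are packed into the first VGPR assigned to one of
+    // them; later arguments have their Reg redirected to that register.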
+    if (DstReg.isValid())
+      Reg = DstReg;
+    else
+      DstReg = Reg;
+    // According to the calling convention, VGPR0-31 are used for passing
+    // function arguments, no matter whether they are regular arguments or
+    // 'inreg' arguments that get spilled into VGPRs. Therefore, at most 32
+    // 'inreg' arguments can be spilled to VGPRs.
+    assert(CurLane < 32 && "more than expected VGPR inreg arguments");
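+    // Build llvm.amdgcn.writelane(Val, CurLane, LastWrite), writing Val into
+    // the next free lane of the accumulated VGPR value.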
+    SmallVector<SDValue, 4> Operands{
+        DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32), Val,
+        DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
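+    // The first write seeds the tied input with the virtual register that
+    // holds DstReg's incoming value; each later write is chained onto the
+    // previous one.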
+    if (!LastWrite) {
+      Register VReg = MF.getRegInfo().getLiveInVirtReg(DstReg);
+      LastWrite = DAG.getRegister(VReg, VT);
+    }
+    Operands.push_back(LastWrite);
+    LastWrite = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+  }
+
+  SDValue finalize(SDValue Chain, const SDLoc &SL, SDValue InGlue) {
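+    // Returns an empty SDValue when no lanes were written so that the caller
+    // can skip emitting the final copy.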
+    if (!LastWrite)
+      return LastWrite;
+    return DAG.getCopyToReg(Chain, SL, DstReg, LastWrite, InGlue);
+  }
+};
+
 SDValue SITargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -2963,6 +3053,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
   // kern arg offset.
   const Align KernelArgBaseAlign = Align(16);
+  InregVGPRSpillerCallee Spiller(DAG, MF, CCInfo);
 
   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
@@ -3130,8 +3221,17 @@ SDValue SITargetLowering::LowerFormalArguments(
       llvm_unreachable("Unexpected register class in LowerFormalArguments!");
     EVT ValVT = VA.getValVT();
 
-    Reg = MF.addLiveIn(Reg, RC);
-    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+    SDValue Val;
+    // If an argument is marked inreg but gets pushed to a VGPR, it indicates
+    // we've run out of SGPRs for argument passing. In such cases, we'd prefer
+    // to start packing inreg arguments into individual lanes of VGPRs rather
+    // than placing each directly into its own VGPR.
+    if (RC == &AMDGPU::VGPR_32RegClass && Arg.Flags.isInReg()) {
+      Val = Spiller.readLane(Chain, DL, Reg, VT);
+    } else {
+      Reg = MF.addLiveIn(Reg, RC);
+      Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+    }
 
     if (Arg.Flags.isSRet()) {
       // The return object should be reasonably addressable.
@@ -3373,7 +3473,7 @@ SDValue SITargetLowering::LowerCallResult(
 // from the explicit user arguments present in the IR.
 void SITargetLowering::passSpecialInputs(
     CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
-    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+    SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass,
     SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
   // If we don't have a call site, this was a call inserted by
   // legalization. These can never use special inputs.
@@ -3817,7 +3917,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
 
   // Analyze operands of the call, assigning locations to each operand.
@@ -3875,6 +3975,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   MVT PtrVT = MVT::i32;
 
+  InregVGPRSpillerCallSite Spiller(DAG, MF);
+
   // Walk the register/memloc assignments, inserting copies/loads.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
@@ -3988,8 +4090,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   SDValue InGlue;
 
   unsigned ArgIdx = 0;
-  for (auto [Reg, Val] : RegsToPass) {
-    if (ArgIdx++ >= NumSpecialInputs &&
+  for (auto &[Reg, Val] : RegsToPass) {
+    if (ArgIdx >= NumSpecialInputs &&
         (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
       // For chain calls, the inreg arguments are required to be
       // uniform. Speculatively insert a readfirstlane in case we cannot prove
@@ -4008,7 +4110,21 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                         ReadfirstlaneArgs);
     }
 
-    Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
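+    // Non-special inreg arguments assigned to a VGPR are packed into lanes
+    // of a single VGPR through the spiller instead of each occupying its own
+    // register.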
+    if (ArgIdx >= NumSpecialInputs &&
+        Outs[ArgIdx - NumSpecialInputs].Flags.isInReg() &&
+        AMDGPU::VGPR_32RegClass.contains(Reg)) {
+      Spiller.writeLane(DL, Reg, Val,
+                        ArgLocs[ArgIdx - NumSpecialInputs].getLocVT());
+    } else {
+      Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
+      InGlue = Chain.getValue(1);
+    }
+
+    ++ArgIdx;
+  }
+
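+  // Emit a single CopyToReg covering every packed lane, if any lanes were
+  // written.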
+  if (SDValue R = Spiller.finalize(Chain, DL, InGlue)) {
+    Chain = R;
     InGlue = Chain.getValue(1);
   }