@@ -2841,6 +2841,144 @@ void SITargetLowering::insertCopiesSplitCSR(
}
}

+ /// Base class for spilling inreg VGPR arguments.
+ ///
+ /// When an argument marked inreg is pushed to a VGPR, it indicates that the
+ /// available SGPRs for argument passing have been exhausted. In such cases,
+ /// it is preferable to pack multiple inreg arguments into individual lanes
+ /// of VGPRs instead of assigning each directly to a separate VGPR.
+ ///
+ /// Spilling involves two parts: the caller side (call site) and the callee
+ /// side. Both must follow the same method for selecting registers and lanes,
+ /// ensuring that an argument written at the call site matches exactly the
+ /// one read at the callee.
+ ///
+ /// \p InregVGPRSpiller::setReg selects the register used for a given
+ /// argument. If \p CurReg is invalid, it uses the register determined by the
+ /// calling convention. The first inreg VGPR argument is stored in lane 0.
+ ///
+ /// After reading or writing an argument, \p InregVGPRSpiller::forward
+ /// advances the lane counter. When all lanes of a VGPR are used, it resets
+ /// \p CurReg; on the next read or write, the register determined by the
+ /// calling convention is selected again and lane numbering restarts at 0.
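+ ///
+ /// For example, if three inreg arguments overflow into VGPRs and the calling
+ /// convention would assign them v0, v1, and v2 (a hypothetical assignment),
+ /// both sides instead pack them into lanes 0, 1, and 2 of v0, and v1 and v2
+ /// are released back to the allocator.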
+ class InregVGPRSpiller {
+   CCState &State;
+   const unsigned WaveFrontSize;
+
+   Register CurReg;
+   unsigned CurLane = 0;
+
+ protected:
+   SelectionDAG &DAG;
+   MachineFunction &MF;
+
+   Register getCurReg() const { return CurReg; }
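+   // Lane index within the current VGPR; CurLane itself increases
+   // monotonically across registers.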
+   unsigned getCurLane() const { return CurLane % WaveFrontSize; }
+
+   InregVGPRSpiller(SelectionDAG &DAG, MachineFunction &MF, CCState &State)
+       : State(State),
+         WaveFrontSize(MF.getSubtarget<GCNSubtarget>().getWavefrontSize()),
+         DAG(DAG), MF(MF) {}
+
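+   // Remember the register chosen by the calling convention for the first
+   // overflowing inreg argument; redirect subsequent arguments into further
+   // lanes of that register and release their CC-assigned registers.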
+   void setReg(Register &Reg) {
+     if (CurReg.isValid()) {
+       State.DeallocateReg(Reg);
+       Reg = CurReg;
+     } else {
+       CurReg = Reg;
+     }
+   }
+
+   void forward() {
+     // FIXME: Wrapping may never occur here, since it would imply at least 32
+     // or even 64 inreg arguments, which likely exceeds ABI limits.
+     if (++CurLane % WaveFrontSize == 0)
+       CurReg = Register();
+   }
+ };
+
+ /// The spilling class for the callee side that unpacks lane-packed inreg
+ /// VGPR arguments in LowerFormalArguments. It must mirror the register and
+ /// lane selection performed at the call site.
+ class InregVGPRSpillerCallee {
+   CCState &State;
+   SelectionDAG &DAG;
+   MachineFunction &MF;
+
+   Register SrcReg;
+   SDValue SrcVal;
+   unsigned CurLane = 0;
+
+ public:
+   InregVGPRSpillerCallee(SelectionDAG &DAG, MachineFunction &MF,
+                          CCState &State)
+       : State(State), DAG(DAG), MF(MF) {}
+
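+   // Emits a single CopyFromReg for the packed VGPR on first use, then
+   // extracts each argument with llvm.amdgcn.readlane from successive lanes.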
+   SDValue read(SDValue Chain, const SDLoc &SL, Register &Reg, EVT VT) {
+     if (SrcVal) {
+       State.DeallocateReg(Reg);
+     } else {
+       Reg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+       SrcReg = Reg;
+       SrcVal = DAG.getCopyFromReg(Chain, SL, Reg, VT);
+     }
+     // According to the calling convention, only SGPR4-SGPR29 should be used
+     // for passing 'inreg' function arguments. Therefore, the number of
+     // 'inreg' VGPR arguments must not exceed 26.
+     assert(CurLane < 26 && "more VGPR inreg arguments than expected");
+     SmallVector<SDValue, 4> Operands{
+         DAG.getTargetConstant(Intrinsic::amdgcn_readlane, SL, MVT::i32),
+         DAG.getRegister(SrcReg, VT),
+         DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
+     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands);
+   }
+ };
+
+ /// The spilling class for the caller side that lowers the packing of call
+ /// site arguments.
+ class InregVGPRSpillerCallSite {
+   CCState &State;
+
+   Register DstReg;
+   SDValue Glue;
+   unsigned CurLane = 0;
+
+   SelectionDAG &DAG;
+   MachineFunction &MF;
+
+ public:
+   InregVGPRSpillerCallSite(SelectionDAG &DAG, MachineFunction &MF,
+                            CCState &State)
+       : State(State), DAG(DAG), MF(MF) {}
+
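+   // Mirrors InregVGPRSpillerCallee::read: on first use, emits the CopyToReg
+   // that defines the destination VGPR and captures its glue; each argument
+   // is then inserted into its lane with llvm.amdgcn.writelane.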
+   std::pair<SDValue, SDValue> write(SDValue Chain, const SDLoc &SL,
+                                     Register &Reg, SDValue Val,
+                                     SDValue InGlue, EVT VT) {
+     if (DstReg.isValid()) {
+       Reg = DstReg;
+     } else {
+       DstReg = Reg;
+       Glue = DAG.getCopyToReg(Chain, SL, Reg, Val, InGlue).getValue(1);
+     }
+     // According to the calling convention, only SGPR4-SGPR29 should be used
+     // for passing 'inreg' function arguments. Therefore, the number of
+     // 'inreg' VGPR arguments must not exceed 26.
+     assert(CurLane < 26 && "more VGPR inreg arguments than expected");
+     SmallVector<SDValue, 4> Operands{
+         DAG.getTargetConstant(Intrinsic::amdgcn_writelane, SL, MVT::i32),
+         DAG.getRegister(DstReg, VT), Val,
+         DAG.getTargetConstant(CurLane++, SL, MVT::i32)};
+     return {DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, VT, Operands), Glue};
+   }
+ };
+
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -2963,6 +3101,7 @@ SDValue SITargetLowering::LowerFormalArguments(
// FIXME: Alignment of explicit arguments totally broken with non-0 explicit
// kern arg offset.
const Align KernelArgBaseAlign = Align(16);
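+ // Reads back 'inreg' arguments that overflowed into VGPR lanes; must mirror
+ // the lane packing performed at each call site.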
+ InregVGPRSpillerCallee Spiller(DAG, MF, CCInfo);

for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
@@ -3130,8 +3269,17 @@ SDValue SITargetLowering::LowerFormalArguments(
llvm_unreachable("Unexpected register class in LowerFormalArguments!");
EVT ValVT = VA.getValVT();

- Reg = MF.addLiveIn(Reg, RC);
- SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ SDValue Val;
+ // If an argument is marked inreg but gets pushed to a VGPR, it indicates
+ // we've run out of SGPRs for argument passing. In such cases, we'd prefer
+ // to start packing inreg arguments into individual lanes of VGPRs, rather
+ // than placing them directly into VGPRs.
+ if (RC == &AMDGPU::VGPR_32RegClass && Arg.Flags.isInReg()) {
+   Val = Spiller.read(Chain, DL, Reg, VT);
+ } else {
+   Reg = MF.addLiveIn(Reg, RC);
+   Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+ }

if (Arg.Flags.isSRet()) {
// The return object should be reasonably addressable.
@@ -3373,7 +3521,7 @@ SDValue SITargetLowering::LowerCallResult(
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
- SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
// If we don't have a call site, this was a call inserted by
// legalization. These can never use special inputs.
@@ -3817,7 +3965,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}

const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;

// Analyze operands of the call, assigning locations to each operand.
@@ -3875,6 +4023,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,

MVT PtrVT = MVT::i32;

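+ // Packs overflow 'inreg' arguments into VGPR lanes at the call site; the
+ // callee unpacks them with matching readlane calls.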
+ InregVGPRSpillerCallSite Spiller(DAG, MF, CCInfo);
+
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -3988,8 +4138,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue InGlue;

unsigned ArgIdx = 0;
- for (auto [Reg, Val] : RegsToPass) {
- if (ArgIdx++ >= NumSpecialInputs &&
+ for (auto &[Reg, Val] : RegsToPass) {
+ if (ArgIdx >= NumSpecialInputs &&
(IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
// For chain calls, the inreg arguments are required to be
// uniform. Speculatively insert a readfirstlane in case we cannot prove
@@ -4008,8 +4158,18 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
ReadfirstlaneArgs);
}

- Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
- InGlue = Chain.getValue(1);
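+ // Inreg arguments that overflowed into VGPRs are lane-packed with
+ // writelane instead of being copied whole into separate registers.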
+ if (ArgIdx >= NumSpecialInputs &&
+     Outs[ArgIdx - NumSpecialInputs].Flags.isInReg() &&
+     AMDGPU::VGPR_32RegClass.contains(Reg)) {
+   std::tie(Chain, InGlue) =
+       Spiller.write(Chain, DL, Reg, Val, InGlue,
+                     ArgLocs[ArgIdx - NumSpecialInputs].getLocVT());
+ } else {
+   Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
+   InGlue = Chain.getValue(1);
+ }
+
+ ++ArgIdx;
}

// We don't usually want to end the call-sequence here because we would tidy