AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. #127464

Merged · 5 commits · Feb 21, 2025
24 changes: 11 additions & 13 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1803,19 +1803,16 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
            1 : VSrc_b32);
 }
 
-// Returns the vreg register class to use for sources of VOP3 instructions for the
-// given VT.
-class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
-  RegisterOperand ret =
-      !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
-            !eq(VT.Size, 96) : RegisterOperand<VReg_96>,
-            !eq(VT.Size, 64) : RegisterOperand<VReg_64>,
-            !eq(VT.Size, 48) : RegisterOperand<VReg_64>,
-            !eq(VT.Size, 16) : !if(IsTrue16,
-                                   !if(IsFake16, RegisterOperand<VGPR_32>,
-                                       RegisterOperand<VGPR_16>),
-                                   RegisterOperand<VGPR_32>),
-            1 : RegisterOperand<VGPR_32>);
+// VGPR only VOP3 src with 9 bit encoding
+class getVOP3VRegSrcForVT<ValueType VT> {
+  RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
+                              !eq(VT.Size, 512) : VRegSrc_512,
+                              !eq(VT.Size, 256) : VRegSrc_256,
+                              !eq(VT.Size, 192) : VRegSrc_192,
+                              !eq(VT.Size, 128) : VRegSrc_128,
+                              !eq(VT.Size, 96) : VRegSrc_96,
+                              !eq(VT.Size, 64) : VRegSrc_64,
+                              1 : VRegSrc_32);
 }
 
 // Src2 of VOP3 DPP instructions cannot be a literal
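Note: the repurposed getVOP3VRegSrcForVT simply keys a VGPR-only source operand off the type's bit width. A minimal standalone sketch of that dispatch, runnable with llvm-tblgen (the strings stand in for the real VRegSrc_* RegisterOperand defs, which live elsewhere in the AMDGPU backend):

// Sketch of the !cond width dispatch in getVOP3VRegSrcForVT.
// Strings substitute for the VRegSrc_* register operands.
class PickVRegSrcName<int Size> {
  string ret = !cond(!eq(Size, 1024) : "VRegSrc_1024",
                     !eq(Size, 64) : "VRegSrc_64",
                     1 : "VRegSrc_32");  // default: 32-bit source
}
// A v2f32 source is 64 bits wide, so it resolves to VRegSrc_64.
def SketchV2F32Src0 : PickVRegSrcName<64>;

Each VRegSrc_* operand accepts VGPRs only while keeping the 9-bit VOP3 source encoding noted in the comment.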
@@ -2852,6 +2849,7 @@ def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
 def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>;
 def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
 def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
+def VOP_I32_V2F32_I32_F32 : VOPProfile<[i32, v2f32, i32, f32]>;
 def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
 def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
 def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
17 changes: 16 additions & 1 deletion llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -418,12 +418,27 @@ def VOP_MADMK_F16_fake16 : VOP_MADMK <f16> {
 }
 def VOP_MADMK_F32 : VOP_MADMK <f32>;
 
+// Returns the vreg register class to use for sources of VOP3 instructions for the
+// given VT.
+class getVOP3VRegForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
+  RegisterOperand ret =
+      !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
+            !eq(VT.Size, 96) : RegisterOperand<VReg_96>,
+            !eq(VT.Size, 64) : RegisterOperand<VReg_64>,
+            !eq(VT.Size, 48) : RegisterOperand<VReg_64>,
+            !eq(VT.Size, 16) : !if(IsTrue16,
+                                   !if(IsFake16, RegisterOperand<VGPR_32>,
+                                       RegisterOperand<VGPR_16>),
+                                   RegisterOperand<VGPR_32>),
+            1 : RegisterOperand<VGPR_32>);
+}
+
 // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
 // and processing time but it makes it easier to convert to mad.
 class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2);
   // Src2 must accept the same operand types as vdst, namely VGPRs only
-  let Src2RC64 = getVOP3VRegSrcForVT<Src2VT, IsTrue16, !not(IsRealTrue16)>.ret;
+  let Src2RC64 = getVOP3VRegForVT<Src2VT, IsTrue16, !not(IsRealTrue16)>.ret;
   let Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, 3,
                        0, HasModifiers, HasModifiers, HasOMod,
                        Src0Mod, Src1Mod, Src2Mod>.ret;
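The VOP_MAC change is a move-and-rename with unchanged semantics: the old helper now lives here as getVOP3VRegForVT, and src2, being tied to vdst, stays VGPR-only. Under True16 only real t16 encodings get a true 16-bit VGPR. A small standalone sketch of that branch (llvm-tblgen; strings stand in for the RegisterOperand values):

// Sketch of the 16-bit True16/Fake16 branch in getVOP3VRegForVT.
class PickMacSrc2Name<int Size, bit IsTrue16 = 0, bit IsFake16 = 0> {
  string ret = !cond(!eq(Size, 16) : !if(IsTrue16,
                                         !if(IsFake16, "VGPR_32", "VGPR_16"),
                                         "VGPR_32"),
                     1 : "VGPR_32");
}
def MacSrc2RealTrue16 : PickMacSrc2Name<16, 1, 0>;  // ret = "VGPR_16"
def MacSrc2Fake16     : PickMacSrc2Name<16, 1, 1>;  // ret = "VGPR_32"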
16 changes: 14 additions & 2 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1052,7 +1052,11 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
   let HasFP4DstByteSel = 1;
 }
 
-def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
+class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P>
+    : VOP3_Profile<P, VOP3_OPSEL> {
+
+  let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VRegSrcForVT<P.Src0VT>.ret,
+                     getVOP3SrcForVT<P.Src0VT>.ret);
   let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
                           Int32InputMods: $src1_modifiers, Src1RC64:$src1,
                           FP32InputMods: $src2_modifiers, Src2RC64:$src2,
@@ -1100,6 +1104,11 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
   let HasExt32BitDPP = 0;
   let HasExtVOP3DPP = 0;
   let HasExt64BitDPP = 0;
+
+  // All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
+  // any operand slots > 32 bit.
+  let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VRegSrcForVT<P.Src0VT>.ret,
+                     getVOP3SrcForVT<P.Src0VT>.ret);
 }
 
 let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
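Both convert-scale profiles gate src0 the same way: anything wider than 32 bits is forced onto the VGPR-only source class, while 32-bit sources keep the general VOP3 source operand, which also admits SGPRs and inline constants. A standalone sketch of the gate (llvm-tblgen; strings stand in for the operand classes):

// Sketch of the !gt width gate applied to Src0RC64.
class PickCvtSrc0<int Src0Size> {
  string ret = !if(!gt(Src0Size, 32), "getVOP3VRegSrcForVT",  // VGPR only
                                      "getVOP3SrcForVT");     // VGPR/SGPR/imm
}
def PackedWideSrc0 : PickCvtSrc0<192>;  // e.g. a 192-bit packed FP6 source
def ScalarF32Src0  : PickCvtSrc0<32>;   // f32 stays on the general class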
@@ -1141,7 +1150,10 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
   let Constraints = "@earlyclobber $vdst" in {
     defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
     defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
-    defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+    defm V_CVT_SCALEF32_SR_PK_FP4_F32
+        : VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32",
+                   VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<
+                       VOP_I32_V2F32_I32_F32>>;
   }
 }
 defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
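Going by the opcode name, the operands of V_CVT_SCALEF32_SR_PK_FP4_F32 under the new VOP_I32_V2F32_I32_F32 profile are presumably packed v2f32 data in src0 (the 64-bit slot this patch restricts to VGPRs), the stochastic-rounding seed in src1, and the f32 scale in src2. A sketch of how a VOPProfile type list reads positionally (llvm-tblgen; strings stand in for the ValueTypes):

// A VOPProfile type list is ordered [DstVT, Src0VT, Src1VT, Src2VT].
class SketchProfile<list<string> ArgVT> {
  string DstVT  = ArgVT[0];
  string Src0VT = ArgVT[1];
  string Src1VT = ArgVT[2];
  string Src2VT = ArgVT[3];
}
// Mirrors VOP_I32_V2F32_I32_F32: only src0 exceeds 32 bits.
def SketchCvtSrPkFp4F32 : SketchProfile<["i32", "v2f32", "i32", "f32"]>;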