-
Notifications
You must be signed in to change notification settings - Fork 14.6k
AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. #127464
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for any operand slots providing more than 32-bits of data. Change-Id: Ic48ef739875c341e85f1a6c3984851ae497592af
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-mc Author: Pravin Jagtap (pravinjagtap) ChangesAll convert opcodes operating on FP6/BF6/FP4 Change-Id: Ic48ef739875c341e85f1a6c3984851ae497592af Full diff: https://github.com/llvm/llvm-project/pull/127464.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bb78e77a9dc1a..e2d0b28a391fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1818,6 +1818,18 @@ class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
1 : RegisterOperand<VGPR_32>);
}
+// VGPR only VOP3 src with 9 bit encoding
+class getVOP3VSrcReg9ForVT<ValueType VT> {
+ RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
+ !eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 256) : VRegSrc_256,
+ !eq(VT.Size, 192) : VRegSrc_192,
+ !eq(VT.Size, 128) : VRegSrc_128,
+ !eq(VT.Size, 96) : VRegSrc_96,
+ !eq(VT.Size, 64) : VRegSrc_64,
+ 1 : VRegSrc_32);
+}
+
// Src2 of VOP3 DPP instructions cannot be a literal
class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
RegisterOperand ret =
@@ -2852,6 +2864,7 @@ def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>;
def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
+def VOP_I32_V2F32_I32_F32 : VOPProfile<[i32, v2f32, i32, f32]>;
def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index afafc2ecccfaf..6a6d8bd94826f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1052,7 +1052,10 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasFP4DstByteSel = 1;
}
-def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
+class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
+
+ let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+ getVOP3SrcForVT<P.Src0VT>.ret);
let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
Int32InputMods: $src1_modifiers, Src1RC64:$src1,
FP32InputMods: $src2_modifiers, Src2RC64:$src2,
@@ -1100,6 +1103,11 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
let HasExt32BitDPP = 0;
let HasExtVOP3DPP = 0;
let HasExt64BitDPP = 0;
+
+ // All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
+ // any operand slots > 32 bit.
+ let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+ getVOP3SrcForVT<P.Src0VT>.ret);
}
let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
@@ -1141,7 +1149,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
- defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+ defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOP_I32_V2F32_I32_F32>>;
}
}
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index e0b832d8fe297..099916f48b5e7 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -434,3 +434,66 @@ v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 clamp
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_fp6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_bf6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v4, v5
+v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3], v4, v5
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should also have codegen tests that show SGPR to VGPR copies inserted in lowering
Do you mean all of BF8/BF6/FP8/FP6/FP4? Please make the title and description consistent. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/16183 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/12584 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/81/builds/4962 Here is the relevant piece of the build log for the reference
|
…lvm#127464) The Src0 operand width higher that 32-bits of cvt_scale opcodes operating on FP6/BF6/FP4 need to be restricted to take only VGPRs.
…lvm#127464) The Src0 operand width higher that 32-bits of cvt_scale opcodes operating on FP6/BF6/FP4 need to be restricted to take only VGPRs.
The Src0 operand width higher that 32-bits of cvt_scale opcodes operating on FP6/BF6/FP4 need to be restricted to take only VGPRs.