Skip to content

Commit 6e722bb

Browse files
authored
[AMDGPU] Support byte_sel modifier on v_cvt_sr_fp8_f32 and v_cvt_sr_bf8_f32 (#90244)
1 parent 300340f commit 6e722bb

17 files changed

+256
-42
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
172172
ImmTyWaitEXP,
173173
ImmTyWaitVAVDst,
174174
ImmTyWaitVMVSrc,
175+
ImmTyByteSel,
175176
};
176177

177178
// Immediate operand kind.
@@ -410,6 +411,9 @@ class AMDGPUOperand : public MCParsedAsmOperand {
410411
bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
411412
bool isNegLo() const { return isImmTy(ImmTyNegLo); }
412413
bool isNegHi() const { return isImmTy(ImmTyNegHi); }
414+
/// Returns true when this operand is an immediate tagged ImmTyByteSel and its
/// value passes isUInt<2>, i.e. it is representable in two unsigned bits.
bool isByteSel() const {
415+
return isImmTy(ImmTyByteSel) && isUInt<2>(getImm());
416+
}
413417

414418
bool isRegOrImm() const {
415419
return isReg() || isImm();
@@ -1139,6 +1143,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
11391143
case ImmTyWaitEXP: OS << "WaitEXP"; break;
11401144
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
11411145
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
1146+
case ImmTyByteSel: OS << "ByteSel" ; break;
11421147
}
11431148
// clang-format on
11441149
}
@@ -8644,6 +8649,13 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
86448649
}
86458650
}
86468651

8652+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
8653+
assert(AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in));
8654+
Inst.addOperand(Inst.getOperand(0));
8655+
addOptionalImmOperand(Inst, Operands, OptionalIdx,
8656+
AMDGPUOperand::ImmTyByteSel);
8657+
}
8658+
86478659
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
86488660
addOptionalImmOperand(Inst, Operands, OptionalIdx,
86498661
AMDGPUOperand::ImmTyClampSI);
@@ -8680,8 +8692,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
86808692

86818693
if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
86828694
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
8683-
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
8684-
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
8695+
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 ||
8696+
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) {
86858697
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
86868698
Inst.addOperand(Inst.getOperand(0));
86878699
}
@@ -8692,7 +8704,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
86928704
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
86938705
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
86948706
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
8695-
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
8707+
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12 ||
8708+
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
8709+
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
8710+
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
8711+
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) {
86968712
assert(!IsPacked);
86978713
Inst.addOperand(Inst.getOperand(0));
86988714
}
@@ -9207,10 +9223,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
92079223
Inst.addOperand(Inst.getOperand(0));
92089224
}
92099225

9210-
bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
9211-
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
9212-
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
9213-
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
9226+
bool IsVOP3CvtSrDpp =
9227+
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
9228+
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
9229+
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
9230+
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12;
92149231
if (IsVOP3CvtSrDpp) {
92159232
if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
92169233
Inst.addOperand(MCOperand::createImm(0));
@@ -9243,6 +9260,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
92439260
llvm_unreachable("unhandled operand type");
92449261
}
92459262
}
9263+
9264+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
9265+
addOptionalImmOperand(Inst, Operands, OptionalIdx,
9266+
AMDGPUOperand::ImmTyByteSel);
9267+
92469268
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
92479269
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
92489270

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -869,10 +869,6 @@ void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
869869
if (VDstInIdx != -1)
870870
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
871871

872-
if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
873-
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
874-
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
875-
876872
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
877873
if (MI.getNumOperands() < DescNumOps &&
878874
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
@@ -902,10 +898,6 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
902898
if (VDstInIdx != -1)
903899
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
904900

905-
if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
906-
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
907-
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
908-
909901
unsigned Opc = MI.getOpcode();
910902
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
911903
if (MI.getNumOperands() < DescNumOps &&

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
409409
if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
410410
DPPInst.addImm(NegHiOpr->getImm());
411411
}
412+
auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
413+
if (ByteSelOpr &&
414+
AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
415+
DPPInst.addImm(ByteSelOpr->getImm());
416+
}
412417
}
413418
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
414419
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1806,4 +1806,14 @@ void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
18061806
O << ' ' << formatDec(Imm);
18071807
}
18081808

1809+
/// Prints the byte_sel modifier for operand \p OpNo of \p MI to \p O.
///
/// The operand's immediate is read into a uint8_t. A zero value prints
/// nothing; any other value is written as " byte_sel:" followed by
/// formatDec of that value. \p STI is not used in this body.
void AMDGPUInstPrinter::printByteSel(const MCInst *MI, unsigned OpNo,
1810+
const MCSubtargetInfo &STI,
1811+
raw_ostream &O) {
1812+
uint8_t Imm = MI->getOperand(OpNo).getImm();
1813+
// A zero selector is omitted from the printed instruction.
if (!Imm)
1814+
return;
1815+
1816+
O << " byte_sel:" << formatDec(Imm);
1817+
}
1818+
18091819
#include "AMDGPUGenAsmWriter.inc"

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,8 @@ class AMDGPUInstPrinter : public MCInstPrinter {
186186
const MCSubtargetInfo &STI, raw_ostream &O);
187187
void printExpTgt(const MCInst *MI, unsigned OpNo,
188188
const MCSubtargetInfo &STI, raw_ostream &O);
189+
void printByteSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
190+
raw_ostream &O);
189191

190192
public:
191193
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1123,6 +1123,8 @@ def WaitEXP : NamedIntOperand<i8, "wait_exp">;
11231123
def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst">;
11241124
def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc">;
11251125

1126+
// Named integer operand for the "byte_sel" modifier, declared with value type i8.
def ByteSel : NamedIntOperand<i8, "byte_sel">;
1127+
11261128
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
11271129
let OperandNamespace = "AMDGPU";
11281130
let OperandType = "OPERAND_KIMM"#vt.Size;
@@ -1700,9 +1702,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
17001702
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
17011703
Src1Mod:$src1_modifiers, Src1RC:$src1,
17021704
clampmod0:$clamp, omod0:$omod),
1703-
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
1704-
Src1Mod:$src1_modifiers, Src1RC:$src1,
1705-
clampmod0:$clamp))
1705+
!con((ins Src0Mod:$src0_modifiers, Src0RC:$src0,
1706+
Src1Mod:$src1_modifiers, Src1RC:$src1),
1707+
!if(HasClamp, (ins clampmod0:$clamp), (ins))))
17061708
/* else */,
17071709
// VOP2 without modifiers
17081710
!if (HasClamp,
@@ -2036,7 +2038,8 @@ class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT
20362038
class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
20372039
bit HasOpSel, bit HasOMod, bit IsVOP3P,
20382040
bit HasModifiers, bit Src0HasMods,
2039-
bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32> {
2041+
bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32,
2042+
bit HasByteSel = 0> {
20402043
string dst = !if(HasDst,
20412044
!if(!eq(DstVT.Size, 1),
20422045
"$sdst",
@@ -2058,14 +2061,15 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
20582061
string src1 = !if(Src1HasMods, src1mods, src1nomods);
20592062
string src2 = !if(Src2HasMods, src2mods, src2nomods);
20602063
string opsel = !if(HasOpSel, "$op_sel", "");
2064+
string bytesel = !if(HasByteSel, "$byte_sel", "");
20612065
string 3PMods = !if(IsVOP3P,
20622066
!if(HasOpSel, "$op_sel_hi", "")
20632067
#!if(HasModifiers, "$neg_lo$neg_hi", ""),
20642068
"");
20652069
string clamp = !if(HasClamp, "$clamp", "");
20662070
string omod = !if(HasOMod, "$omod", "");
20672071

2068-
string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#3PMods#clamp#omod, "");
2072+
string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod, "");
20692073

20702074
}
20712075

@@ -2282,6 +2286,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
22822286
field bit IsSWMMAC = 0;
22832287

22842288
field bit IsFP8 = 0;
2289+
field bit IsFP8DstByteSel = 0;
22852290

22862291
field bit HasDst = !ne(DstVT.Value, untyped.Value);
22872292
field bit HasDst32 = HasDst;
@@ -2401,7 +2406,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
24012406
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
24022407
field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
24032408
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
2404-
HasModifiers, DstVT>.ret;
2409+
HasModifiers, DstVT, IsFP8DstByteSel>.ret;
24052410
field string Asm64 = AsmVOP3Base;
24062411
field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
24072412
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 40 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -580,6 +580,22 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
580580
HasSrc2FloatMods>.ret>.ret);
581581
}
582582

583+
// VOP3 profile for conversions that take a byte_sel operand.
// The wrapped VOPProfile is [i32, SrcVT, i32, untyped]. The profile sets
// IsFP8DstByteSel, turns HasClamp off, and appends the pair
// (VGPR_32:$vdst_in, ByteSel:$byte_sel) to the end of both the Ins64 and the
// InsVOP3Base operand lists.
class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
584+
VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
585+
let IsFP8DstByteSel = 1;
586+
let HasClamp = 0;
587+
// Trailing operands shared by both input lists below.
defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel);
588+
let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
589+
HasClamp, HasModifiers, HasSrc2Mods,
590+
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
591+
bytesel);
592+
let InsVOP3Base = !con(
593+
getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
594+
Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
595+
Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, HasOpSel>.ret,
596+
bytesel);
597+
}
598+
583599
def IsPow2Plus1: PatLeaf<(i32 imm), [{
584600
uint32_t V = N->getZExtValue();
585601
return isPowerOf2_32(V - 1);
@@ -645,12 +661,17 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
645661
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
646662
defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
647663
defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
664+
665+
let SubtargetPredicate = isGFX12Plus in {
666+
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
667+
defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
668+
}
648669
}
649670

650671
// These instructions have non-standard use of op_sel. In particular they are
651672
// using op_sel bits 2 and 3 while only having two sources. Therefore dummy
652673
// src2 is used to hold the op_sel value.
653-
let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
674+
let Constraints = "$vdst = $src2", DisableEncoding = "$src2", SubtargetPredicate = isGFX940Plus in {
654675
defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
655676
defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
656677
}
@@ -667,15 +688,28 @@ class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst>
667688
!if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0)
668689
>;
669690

691+
// Selection pattern for the byte_sel form of a conversion.
// Matches `node` applied to a VOP3Mods SrcVT source, a VOP3Mods i32 source,
// an i32 $old value and a target immediate $byte_sel, and emits `inst` with
// the modifiers/sources in order followed by $old and the byte_sel immediate
// (converted via as_i32timm).
// NOTE(review): $old presumably feeds the instruction's $vdst_in operand - confirm
// against the profile's operand order.
class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType SrcVT> : GCNPat<
692+
(i32 (node (VOP3Mods SrcVT:$src0, i32:$src0_modifiers), (VOP3Mods i32:$src1, i32:$src1_modifiers),
693+
i32:$old, timm:$byte_sel)),
694+
(inst $src0_modifiers, $src0, $src1_modifiers, $src1, $old, (as_i32timm $byte_sel))
695+
>;
696+
670697
let OtherPredicates = [HasFP8ConversionInsts] in {
671698
foreach Index = [0, -1] in {
672699
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
673700
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
674701
}
675702

676-
foreach Index = [0, 1, 2, 3] in {
677-
def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
678-
def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
703+
let SubtargetPredicate = isGFX940Plus in {
704+
foreach Index = [0, 1, 2, 3] in {
705+
def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
706+
def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
707+
}
708+
}
709+
710+
let SubtargetPredicate = isGFX12Plus in {
711+
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
712+
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>;
679713
}
680714
}
681715

@@ -1040,8 +1074,8 @@ defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
10401074

10411075
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
10421076
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
1043-
defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>;
1044-
defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>;
1077+
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
1078+
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
10451079

10461080
//===----------------------------------------------------------------------===//
10471081
// GFX11, GFX12

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,14 @@ class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
311311
let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
312312
}
313313

314+
// Encoding class derived from VOP3e_gfx10 for instructions that carry a
// 2-bit byte_sel field. Inst{11} and Inst{12} are forced to 0 and byte_sel
// is placed in Inst{14-13}.
class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
315+
bits<2> byte_sel;
316+
317+
let Inst{11} = 0; // op_sel0
318+
let Inst{12} = 0; // op_sel1
319+
let Inst{14-13} = byte_sel; // op_sel2/3
320+
}
321+
314322
class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
315323
let Inst{11} = ?;
316324
let Inst{12} = ?;
@@ -741,15 +749,16 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
741749
bits<3> src2_modifiers;
742750
bits<1> clamp;
743751
bits<2> omod;
752+
bits<2> byte_sel;
744753

745754
let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
746755
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
747756
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
748757
// OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
749758
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
750759
let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
751-
let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?);
752-
let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?);
760+
let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
761+
let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
753762
let Inst{15} = !if(P.HasClamp, clamp, 0);
754763
let Inst{25-16} = op;
755764
let Inst{31-26} = 0x35;
@@ -1388,7 +1397,11 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
13881397
bit isSingle = 0> {
13891398
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
13901399
let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
1391-
if ps.Pfl.HasOpSel then {
1400+
if ps.Pfl.IsFP8DstByteSel then {
1401+
def _e64#Gen.Suffix :
1402+
VOP3_Real_Gen<ps, Gen>,
1403+
VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
1404+
} if ps.Pfl.HasOpSel then {
13921405
def _e64#Gen.Suffix :
13931406
VOP3_Real_Gen<ps, Gen>,
13941407
VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
@@ -1419,6 +1432,10 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
14191432
def _e64#Gen.Suffix :
14201433
VOP3_Real_Gen<ps, Gen>,
14211434
VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
1435+
} else if ps.Pfl.IsFP8DstByteSel then {
1436+
def _e64#Gen.Suffix :
1437+
VOP3_Real_Gen<ps, Gen>,
1438+
VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
14221439
} else if ps.Pfl.HasOpSel then {
14231440
def _e64#Gen.Suffix :
14241441
VOP3_Real_Gen<ps, Gen>,

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,7 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a
9797
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
9898
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1:
9999
; GFX12: ; %bb.0:
100-
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
101-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
102-
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
100+
; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
103101
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
104102
; GFX12-NEXT: s_nop 0
105103
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -114,9 +112,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a
114112
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
115113
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2:
116114
; GFX12: ; %bb.0:
117-
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
118-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
119-
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
115+
; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
120116
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
121117
; GFX12-NEXT: s_nop 0
122118
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

0 commit comments

Comments
 (0)