AMDGPU/GlobalISel: Uniformity info based regbankselect #73684
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

Changes

RFC for an approach to rewriting reg-bank-selection using the machine uniformity info analysis.

Patch is 241.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73684.diff

14 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
index 2ea03ddb1fccd6c..ca9c70e7ed4a017 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
@@ -15,7 +15,10 @@
#include "AMDGPURegBankSelect.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#define DEBUG_TYPE "regbankselect"
@@ -68,7 +71,24 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
MachineUniformityInfo Uniformity =
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
!ST.isSingleLaneExecution(F));
- (void)Uniformity; // TODO: Use this
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_FADD: {
+ Register Dst = MI.getOperand(0).getReg();
+ if (Uniformity.isUniform(Dst)) {
+ MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::SGPRRegBankID));
+ } else {
+ MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::VGPRRegBankID));
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ }
assignRegisterBanks(MF);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79fb..b8d6a3bd3c0e6fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -695,58 +695,108 @@ static LLT getHalfSizedType(LLT Ty) {
// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
-Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
- MachineRegisterInfo &MRI,
- Register Src) const {
+Register AMDGPURegisterBankInfo::buildReadFirstLaneSrc(MachineIRBuilder &B,
+ Register Src) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
LLT Ty = MRI.getType(Src);
const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
- if (Bank == &AMDGPU::SGPRRegBank)
- return Src;
-
- unsigned Bits = Ty.getSizeInBits();
- assert(Bits % 32 == 0);
-
if (Bank != &AMDGPU::VGPRRegBank) {
// We need to copy from AGPR to VGPR
Src = B.buildCopy(Ty, Src).getReg(0);
MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
}
+ Register Dst = MRI.createGenericVirtualRegister(Ty);
+ MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
+ buildReadFirstLaneForType(B, Dst, Src);
+ return Dst;
+}
+
+// Create new vgpr destination register for MI then move it to current
+// MI's sgpr destination using one or more V_READFIRSTLANE_B32 instructions.
+void AMDGPURegisterBankInfo::buildReadFirstLaneDst(MachineIRBuilder &B,
+ MachineInstr &MI) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
+ if (DstBank != &AMDGPU::SGPRRegBank)
+ return;
+
+ Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst));
+ MRI.setRegBank(VgprDst, AMDGPU::VGPRRegBank);
+
+ MI.getOperand(0).setReg(VgprDst);
+ MachineBasicBlock *MBB = MI.getParent();
+ B.setInsertPt(*MBB, std::next(MI.getIterator()));
+ // readFirstLane VgprDst into Dst after MI.
+ return buildReadFirstLaneForType(B, Dst, VgprDst);
+}
+
+void AMDGPURegisterBankInfo::buildReadFirstLaneB32(MachineIRBuilder &B,
+ Register SgprDst,
+ Register VgprSrc) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {SgprDst}, {VgprSrc});
+ MRI.setRegClass(VgprSrc, &AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(SgprDst, &AMDGPU::SReg_32RegClass);
+}
+
+void AMDGPURegisterBankInfo::buildReadFirstLaneSequenceOfB32(
+ MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
+ unsigned NumElts) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
LLT S32 = LLT::scalar(32);
- unsigned NumParts = Bits / 32;
- SmallVector<Register, 8> SrcParts;
- SmallVector<Register, 8> DstParts;
+ SmallVector<Register, 8> VgprSrcParts;
+ SmallVector<Register, 8> SgprDstParts;
- if (Bits == 32) {
- SrcParts.push_back(Src);
- } else {
- auto Unmerge = B.buildUnmerge(S32, Src);
- for (unsigned i = 0; i < NumParts; ++i)
- SrcParts.push_back(Unmerge.getReg(i));
+ for (unsigned i = 0; i < NumElts; ++i) {
+ VgprSrcParts.push_back(MRI.createGenericVirtualRegister(S32));
+ SgprDstParts.push_back(MRI.createGenericVirtualRegister(S32));
}
- for (unsigned i = 0; i < NumParts; ++i) {
- Register SrcPart = SrcParts[i];
- Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
+ B.buildUnmerge(VgprSrcParts, VgprSrc);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ buildReadFirstLaneB32(B, SgprDstParts[i], VgprSrcParts[i]);
+ }
+ B.buildMergeLikeInstr(SgprDst, SgprDstParts);
+}
- const TargetRegisterClass *Constrained =
- constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
- (void)Constrained;
- assert(Constrained && "Failed to constrain readfirstlane src reg");
+void AMDGPURegisterBankInfo::buildReadFirstLaneForType(MachineIRBuilder &B,
+ Register SgprDst,
+ Register VgprSrc) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT S16 = LLT::scalar(16);
+ LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
+ LLT Ty = MRI.getType(SgprDst);
+
+ if (Ty == S16) {
+ Register VgprSrc32 = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegBank(VgprSrc32, AMDGPU::VGPRRegBank);
+ Register SgprDst32 = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegBank(SgprDst32, AMDGPU::SGPRRegBank);
+
+ B.buildAnyExt(VgprSrc32, VgprSrc);
+ buildReadFirstLaneB32(B, SgprDst32, VgprSrc32);
+ B.buildTrunc(SgprDst, SgprDst32);
+ return;
+ }
- B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
+ if (Ty == S32 || Ty == LLT::pointer(3, 32)) {
+ return buildReadFirstLaneB32(B, SgprDst, VgprSrc);
+ }
- DstParts.push_back(DstPart);
+ if (Ty == S64 || Ty == LLT::pointer(0, 64) || Ty == LLT::pointer(1, 64)) {
+ return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc, 2);
}
- if (Bits == 32)
- return DstParts[0];
+ if (Ty.isVector() && Ty.getElementType() == S32) {
+ return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc,
+ Ty.getNumElements());
+ }
- Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
- MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
- return Dst;
+ llvm_unreachable("Type not supported");
}
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
@@ -883,7 +933,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.setMBB(*LoopBB);
}
- Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
+ Register CurrentLaneReg = buildReadFirstLaneSrc(B, OpReg);
// Build the comparison(s).
unsigned OpSize = OpTy.getSizeInBits();
@@ -1015,10 +1065,23 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
if (Bank == &AMDGPU::SGPRRegBank)
return;
- Reg = buildReadFirstLane(B, MRI, Reg);
+ Reg = buildReadFirstLaneSrc(B, Reg);
MI.getOperand(OpIdx).setReg(Reg);
}
+// MI has uniform inputs and output but only available machine instruction has
+// vgpr dest. Make it uniform by moving dst to sgpr using readfirstlane.
+void AMDGPURegisterBankInfo::constrainVgprDstOpWithReadfirstlane(
+ MachineIRBuilder &B, MachineInstr &MI,
+ const OperandsMapper &OpdMapper) const {
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank != &AMDGPU::VGPRRegBank)
+ buildReadFirstLaneDst(B, MI);
+
+ return;
+}
+
/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
@@ -1591,7 +1654,7 @@ bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
if (!DstOnValu) {
- DstHi = buildReadFirstLane(B, MRI, DstHi);
+ DstHi = buildReadFirstLaneSrc(B, DstHi);
} else {
MulHiInVgpr = true;
}
@@ -2100,6 +2163,17 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
B.setInstrAndDebugLoc(MI);
unsigned Opc = MI.getOpcode();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+ switch (Opc) {
+ case AMDGPU::G_FADD:
+ applyDefaultMapping(OpdMapper);
+ if (!Subtarget.hasSALUFloatInsts())
+ constrainVgprDstOpWithReadfirstlane(B, MI, OpdMapper);
+ return;
+ default:
+ break;
+ }
+
switch (Opc) {
case AMDGPU::G_CONSTANT:
case AMDGPU::G_IMPLICIT_DEF: {
@@ -3355,6 +3429,28 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
MI.getNumOperands());
}
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingVOPWithPreassignedDef(
+ const MachineInstr &MI) const {
+ SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ // Dst reg bank should have been set already by uniformity info
+ OpdsMapping[0] =
+ getPreAssignedOpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg())
+ continue;
+
+ unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
+ unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
+ OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
+ }
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
+}
+
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -3507,6 +3603,22 @@ AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}
+const RegisterBankInfo::ValueMapping *
+AMDGPURegisterBankInfo::getPreAssignedOpMapping(
+ Register Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
+ assert(Bank);
+ unsigned BankId = Bank->getID();
+ unsigned Size = getSizeInBits(Reg, MRI, TRI);
+ if (Size != 1)
+ assert(BankId == AMDGPU::SGPRRegBankID || BankId == AMDGPU::VGPRRegBankID);
+ else
+ assert(BankId == AMDGPU::SGPRRegBankID || BankId == AMDGPU::VCCRegBankID);
+
+ return AMDGPU::getValueMapping(BankId, Size);
+}
+
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
const MachineRegisterInfo &MRI,
@@ -3623,6 +3735,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+ // Switch for uniformity info based regbank selection.
+ // Does not inspect register bank on incoming operands.
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_FADD: {
+ if (!Subtarget.hasSALUFloatInsts())
+ return getDefaultMappingVOPWithPreassignedDef(MI);
+
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
+ default:
+ break;
+ }
+
switch (MI.getOpcode()) {
default:
return getInvalidInstructionMapping();
@@ -3718,7 +3847,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
- case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index b5d16e70ab23a20..c4a130eb147fea7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -57,14 +57,29 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
iterator_range<MachineBasicBlock::iterator> Range,
SmallSet<Register, 4> &SGPROperandRegs) const;
- Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- Register Src) const;
+ Register buildReadFirstLaneSrc(MachineIRBuilder &B, Register Src) const;
+
+ void buildReadFirstLaneDst(MachineIRBuilder &B, MachineInstr &MI) const;
+
+ void buildReadFirstLaneForType(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc) const;
+
+ void buildReadFirstLaneB32(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc) const;
+
+ void buildReadFirstLaneSequenceOfB32(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc,
+ unsigned NumElts) const;
bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI,
ArrayRef<unsigned> OpIndices) const;
void constrainOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
unsigned OpIdx) const;
+ void
+ constrainVgprDstOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
+ const OperandsMapper &OpdMapper) const;
+
bool applyMappingDynStackAlloc(MachineIRBuilder &B,
const OperandsMapper &OpdMapper,
MachineInstr &MI) const;
@@ -113,6 +128,12 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const;
+ // Return a value mapping for an operand that is same as already assigned
+ // reg bank or corresponds to assigned register class + LLT
+ const ValueMapping *
+ getPreAssignedOpMapping(Register Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
// Return a value mapping for an operand that is required to be a AGPR.
const ValueMapping *getAGPROpMapping(Register Reg,
const MachineRegisterInfo &MRI,
@@ -152,6 +173,9 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
+ const InstructionMapping &
+ getDefaultMappingVOPWithPreassignedDef(const MachineInstr &MI) const;
+
const InstructionMapping &getDefaultMappingAllVGPR(
const MachineInstr &MI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
index f3e561578363967..8c981f92c921272 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
@@ -49,21 +49,31 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s7, v0
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s8, v4
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s9, v1
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s6, v3
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s3, v0
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s10, v2
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v3
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s4
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
@@ -90,23 +100,35 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
; GFX9-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v8
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FAST-DENORM-NEXT: v_c...
[truncated]
Extra small nit: the title should use [AMDGPU][GlobalISel], just to make git log easier to search.
Also, the description is a bit confusing: do you intend to add all the opcodes in this commit if the direction is good to go? Or will this only implement G_FADD, with the others coming in later patches? In any case, please update the description to reflect the intention.
if (Uniformity.isUniform(Dst)) {
  MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::SGPRRegBankID));
} else {
  MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::VGPRRegBankID));
}
The {} isn't needed.
Also, if this is going to be expanded for many opcodes, it might be worth putting RBI->getRegBank(AMDGPU::SGPRRegBankID) and the VGPR one into variables to slightly reduce verbosity.
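A minimal sketch of that suggestion (illustration only, not part of the patch; it reuses the MF, MRI, RBI and Uniformity variables from the runOnMachineFunction loop in the diff above):

  // Hoist the two banks out of the loop so each new opcode only needs one line.
  const RegisterBank &SgprRB = RBI->getRegBank(AMDGPU::SGPRRegBankID);
  const RegisterBank &VgprRB = RBI->getRegBank(AMDGPU::VGPRRegBankID);

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      switch (MI.getOpcode()) {
      case AMDGPU::G_FADD: {
        Register Dst = MI.getOperand(0).getReg();
        // Uniform defs get the scalar bank, divergent defs the vector bank.
        MRI->setRegBank(Dst, Uniformity.isUniform(Dst) ? SgprRB : VgprRB);
        break;
      }
      default:
        break;
      }
    }
  }

Each additional opcode would then only need its own case label (or share one) instead of repeating the two getRegBank calls.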
auto Unmerge = B.buildUnmerge(S32, Src);
for (unsigned i = 0; i < NumParts; ++i)
  SrcParts.push_back(Unmerge.getReg(i));
for (unsigned i = 0; i < NumElts; ++i) {
i -> I
void AMDGPURegisterBankInfo::buildReadFirstLaneB32(MachineIRBuilder &B,
                                                   Register SgprDst,
                                                   Register VgprSrc) const {
  MachineRegisterInfo &MRI = *B.getMRI();
nit: can MRI & B be stored inside RegisterBankInfo to reduce the need to pass them around that much?
:) indeed, register bank info should use the same builder everywhere. It is a pretty large change.
Since we mention MachineIRBuilder here, how do we want to implement it? I guess we actually want the CSE builder. What about observers: do we add them in the base class, or is it something targets should do if they want?
for (unsigned i = 0; i < NumElts; ++i) {
  buildReadFirstLaneB32(B, SgprDstParts[i], VgprSrcParts[i]);
}
i -> I
{} not needed
if (Ty == S32 || Ty == LLT::pointer(3, 32)) {
  return buildReadFirstLaneB32(B, SgprDst, VgprSrc);
}

if (Ty == S64 || Ty == LLT::pointer(0, 64) || Ty == LLT::pointer(1, 64)) {
  return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc, 2);
}
{} not needed x2
if (DstBank != &AMDGPU::VGPRRegBank)
  buildReadFirstLaneDst(B, MI);

return;
extra return
OpdsMapping[0] =
    getPreAssignedOpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) {
i -> I, e -> E
I see both lower case i and upper case I used equally often. Is there a particular reason to use upper case I?
Where in the codebase do you see i? A quick search for i = shows no results for me.
In any case, the coding style says variables should be uppercase, so I think it's better to follow the coding style.
From /llvm/lib/:
unsigned I = : 1185
int I = : 276
unsigned i = : 2833
int i = : 551
Ah right, sure, but I still think that the coding style should be followed: https://llvm.org/docs/CodingStandards.html#name-types-functions-variables-and-enumerators-properly
So I is technically correct (the best kind of correct :) ).
OTOH, this is really a microscopic issue (after all, this is just a loop counter) and I'm not going to reject the PR over that, so do whatever you prefer in the end.
if (Size != 1)
  assert(BankId == AMDGPU::SGPRRegBankID || BankId == AMDGPU::VGPRRegBankID);
else
  assert(BankId == AMDGPU::SGPRRegBankID || BankId == AMDGPU::VCCRegBankID);
I'd prefer this, but it's really a small nit so pick whatever you like:
assert(BankId == AMDGPU::SGPRRegBankID || BankId == (Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID));
// Switch for uniformity info based regbank selection.
// Does not inspect register bank on incoming operands.
switch (MI.getOpcode()) {
case AMDGPU::G_FADD: {
Does this switch need to match what's in AMDGPURegBankSelect & applyMappingImpl? If so, please add a comment to all the switches to note that they must be synced.
Force-pushed from 2988086 to 766fa9f
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 766fa9f to 3868373
Is somebody else able to take on this review, please?
Force-pushed from 3868373 to a8e9007
// Switch for uniformity info based regbank selection. Pre-selects register
// bank on dst registers using machine uniformity analysis.
// Keep in sinc with switches in getInstrMapping and applyMappingImpl.
Suggested change:
- // Keep in sinc with switches in getInstrMapping and applyMappingImpl.
+ // Keep in sync with switches in getInstrMapping and applyMappingImpl.
Register SgprDst32 = MRI.createGenericVirtualRegister(S32);
MRI.setRegBank(SgprDst32, AMDGPU::SGPRRegBank);

B.buildAnyExt(VgprSrc32, VgprSrc);
Fold the register creation into the build calls, can set the bank after
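One possible reading of that suggestion, sketched only as an illustration (not taken from the patch): let buildAnyExt create the widened source register itself and set its bank on the result afterwards; the sgpr destination of the readfirstlane still needs an explicit register because it is the def operand of the buildInstr call.

  auto VgprSrc32 = B.buildAnyExt(S32, VgprSrc);
  MRI.setRegBank(VgprSrc32.getReg(0), AMDGPU::VGPRRegBank);

  Register SgprDst32 = MRI.createGenericVirtualRegister(S32);
  MRI.setRegBank(SgprDst32, AMDGPU::SGPRRegBank);
  buildReadFirstLaneB32(B, SgprDst32, VgprSrc32.getReg(0));
  B.buildTrunc(SgprDst, SgprDst32);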
}

if (Bits == 32)
  return DstParts[0];
if (Ty == S32 || Ty == LLT::pointer(3, 32))
Don't use a hardcoded pointer type. This isn't the only 32-bit pointer type either. Plus, <2 x s16> is missing.
I will probably have to search for another opcode or not support gfx12, as this patch leaves regbankselect in a semi-broken state. In the meantime, as a reminder, here is a summary of this and the upcoming patches:
Reg-bank-select does not change pre-assigned reg-banks on registers. Based on the available instructions, it can allocate additional registers and insert copies and readfirstlanes.
Do you agree with the approach in general? Later there are options for optimizations.
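As a condensed illustration of the "insert copies and readfirstlanes" part: this is roughly what the patch does when the only available machine instruction has a vgpr def but the result was pre-assigned to sgpr (the helper name here is made up and only the 32-bit case is shown; the real code is buildReadFirstLaneDst in the diff above).

  // Give the vgpr-only instruction a fresh vgpr def and read it back into the
  // sgpr destination that uniformity analysis pre-assigned.
  static void fixupVgprOnlyDef(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                               MachineInstr &MI) {
    Register SgprDst = MI.getOperand(0).getReg();
    Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(SgprDst));
    MRI.setRegBank(VgprDst, AMDGPU::VGPRRegBank);

    MI.getOperand(0).setReg(VgprDst);
    B.setInsertPt(*MI.getParent(), std::next(MI.getIterator()));
    B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {SgprDst}, {VgprDst});
  }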
Force-pushed from a8e9007 to 142a8cf
Refactor helpers that build readfirstlane for input registers. Required by upcoming patches that need to build readfirstlane for output registers.
The current algorithm only considers register banks for inputs but does not take control flow into account. This is wrong in cases where the inputs are uniform (in sgpr) but, because of divergent control flow, the instruction is divergent and should use vgpr instead of sgpr register banks. The most notable example is phis. Also, in cases where the only available machine instruction uses vgpr registers, uniform instructions end up using vgpr register banks. Start with a simple implementation for G_FADD: pre-select the register bank for the destination register using machine uniformity analysis info, then select register banks that allow selection of the available machine instructions. For G_FADD, a vgpr machine instruction is available on all targets but the sgpr version is not. When there is no sgpr version, assign vgpr register banks and move the vgpr destination to sgpr using readfirstlane.
Force-pushed from 142a8cf to a32e0ff
RFC for an approach to rewriting reg-bank-selection using the machine uniformity info analysis.
Implemented for only one opcode in this patch (G_FADD).
The current proposal is to pre-select register banks on dst registers using MUI before regbankselect starts.
Then, using the pre-assigned dst reg banks instead of looking at the register banks of the inputs, select register banks for instructions so that the available machine instructions can be instruction-selected.
The switch to uniformity-info-based regbank select can go step by step, for groups of opcodes with similar operands, since it does not break anything (until we get to phis).
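For readers who skip the 240 KiB diff, the getInstrMapping side of the proposal boils down to roughly the following shape (condensed from the patch above; the pre-assigned dst bank drives the choice of mapping):

  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD: {
    // No salu float instructions: use the vgpr mapping but keep the
    // pre-assigned sgpr dst and fix it up with readfirstlane in applyMappingImpl.
    if (!Subtarget.hasSALUFloatInsts())
      return getDefaultMappingVOPWithPreassignedDef(MI);

    // Salu float exists: a uniform 16/32-bit G_FADD can stay fully scalar.
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    if ((Size == 32 || Size == 16) && isSALUMapping(MI))
      return getDefaultMappingSOP(MI);
    return getDefaultMappingVOP(MI);
  }
  default:
    break; // fall through to the existing opcode switch
  }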