Skip to content

AMDGPU/GlobalISel: Uniformity info based regbankselect #73684

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,26 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
MachineUniformityInfo Uniformity =
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
!ST.isSingleLaneExecution(F));
(void)Uniformity; // TODO: Use this

// Switch for uniformity info based regbank selection. Pre-selects register
// bank on dst registers using machine uniformity analysis.
// Keep in sync with switches in getInstrMapping and applyMappingImpl.
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
switch (MI.getOpcode()) {
case AMDGPU::G_FADD: {
Register Dst = MI.getOperand(0).getReg();
if (Uniformity.isUniform(Dst))
MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::SGPRRegBankID));
else
MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::VGPRRegBankID));
break;
}
default:
break;
}
}
}

assignRegisterBanks(MF);

Expand Down
191 changes: 154 additions & 37 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -700,58 +700,96 @@ static LLT getHalfSizedType(LLT Ty) {

// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
MachineRegisterInfo &MRI,
Register Src) const {
Register AMDGPURegisterBankInfo::buildReadFirstLaneSrc(MachineIRBuilder &B,
Register Src) const {
MachineRegisterInfo &MRI = *B.getMRI();
LLT Ty = MRI.getType(Src);
const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);

if (Bank == &AMDGPU::SGPRRegBank)
return Src;

unsigned Bits = Ty.getSizeInBits();
assert(Bits % 32 == 0);

if (Bank != &AMDGPU::VGPRRegBank) {
// We need to copy from AGPR to VGPR
Src = B.buildCopy(Ty, Src).getReg(0);
MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
}

return buildReadFirstLaneForType(B, Ty, Src).getReg(0);
}

// Create new vgpr destination register for MI then move it to current
// MI's sgpr destination using one or more V_READFIRSTLANE_B32 instructions.
void AMDGPURegisterBankInfo::buildReadFirstLaneDst(MachineIRBuilder &B,
MachineInstr &MI) const {
MachineRegisterInfo &MRI = *B.getMRI();
Register Dst = MI.getOperand(0).getReg();
const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
if (DstBank != &AMDGPU::SGPRRegBank)
return;

Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst));
MRI.setRegBank(VgprDst, AMDGPU::VGPRRegBank);

MI.getOperand(0).setReg(VgprDst);
MachineBasicBlock *MBB = MI.getParent();
B.setInsertPt(*MBB, std::next(MI.getIterator()));
// readFirstLane VgprDst into Dst after MI.
buildReadFirstLaneForType(B, Dst, VgprDst);
return;
}

MachineInstrBuilder AMDGPURegisterBankInfo::buildReadFirstLaneB32(
MachineIRBuilder &B, const DstOp &SgprDst, const SrcOp &VgprSrc) const {
MachineRegisterInfo &MRI = *B.getMRI();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: can MRI & B be store inside RegisterBankInfo to reduce the need to pass them around that much?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:) indeed, register bank info should use same builder everywhere. It is a pretty large change

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we mention MachineIRBuilder here how do we want to implement it?
I guess we actually want cse builder.
What about observers, do we add them in base class or is it something targets should do if they want?

auto RFL = B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {SgprDst}, {VgprSrc});
MRI.setRegClass(RFL.getReg(0), &AMDGPU::SReg_32RegClass);
MRI.setRegClass(RFL.getReg(1), &AMDGPU::VGPR_32RegClass);
return RFL;
}

MachineInstrBuilder AMDGPURegisterBankInfo::buildReadFirstLaneSequenceOfB32(
MachineIRBuilder &B, const DstOp &SgprDst, const SrcOp &VgprSrc,
unsigned NumElts) const {
MachineRegisterInfo &MRI = *B.getMRI();
LLT S32 = LLT::scalar(32);
unsigned NumParts = Bits / 32;
SmallVector<Register, 8> SrcParts;
SmallVector<Register, 8> DstParts;
SmallVector<Register, 8> SgprDstParts;

if (Bits == 32) {
SrcParts.push_back(Src);
} else {
auto Unmerge = B.buildUnmerge(S32, Src);
for (unsigned i = 0; i < NumParts; ++i)
SrcParts.push_back(Unmerge.getReg(i));
auto Unmerge = B.buildUnmerge(S32, VgprSrc);
for (unsigned i = 0; i < NumElts; ++i) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i -> I

SgprDstParts.push_back(
buildReadFirstLaneB32(B, S32, Unmerge.getReg(i)).getReg(0));
}

for (unsigned i = 0; i < NumParts; ++i) {
Register SrcPart = SrcParts[i];
Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts);
MRI.setRegBank(Merge.getReg(0), AMDGPU::SGPRRegBank);
return Merge;
}

const TargetRegisterClass *Constrained =
constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
(void)Constrained;
assert(Constrained && "Failed to constrain readfirstlane src reg");
MachineInstrBuilder AMDGPURegisterBankInfo::buildReadFirstLaneForType(
MachineIRBuilder &B, const DstOp &SgprDst, const SrcOp &VgprSrc) const {
MachineRegisterInfo &MRI = *B.getMRI();
LLT S16 = LLT::scalar(16);
LLT S32 = LLT::scalar(32);
LLT S64 = LLT::scalar(64);
LLT Ty = SgprDst.getLLTTy(MRI);

B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
if (Ty == S16) {
return B.buildTrunc(
SgprDst, buildReadFirstLaneB32(B, S32, B.buildAnyExt(S32, VgprSrc)));
}

DstParts.push_back(DstPart);
if (Ty == S32 || (Ty.isPointer() && Ty.getSizeInBits() == 32)) {
return buildReadFirstLaneB32(B, SgprDst, VgprSrc);
}

if (Bits == 32)
return DstParts[0];
if (Ty == S64 || (Ty.isPointer() && Ty.getSizeInBits() == 64)) {
return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc, 2);
}

if (Ty.isVector() && Ty.getElementType() == S32) {
return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc,
Ty.getNumElements());
}

Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
return Dst;
llvm_unreachable("Type not supported");
}

/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
Expand Down Expand Up @@ -888,7 +926,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.setMBB(*LoopBB);
}

Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
Register CurrentLaneReg = buildReadFirstLaneSrc(B, OpReg);

// Build the comparison(s).
unsigned OpSize = OpTy.getSizeInBits();
Expand Down Expand Up @@ -1020,10 +1058,21 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
if (Bank == &AMDGPU::SGPRRegBank)
return;

Reg = buildReadFirstLane(B, MRI, Reg);
Reg = buildReadFirstLaneSrc(B, Reg);
MI.getOperand(OpIdx).setReg(Reg);
}

// MI has uniform inputs and output but only available machine instruction has
// vgpr dest. Make it uniform by moving dst to sgpr using readfirstlane.
void AMDGPURegisterBankInfo::constrainVgprDstOpWithReadfirstlane(
MachineIRBuilder &B, MachineInstr &MI,
const OperandsMapper &OpdMapper) const {
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (DstBank != &AMDGPU::VGPRRegBank)
buildReadFirstLaneDst(B, MI);
}

/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
Expand Down Expand Up @@ -1603,7 +1652,7 @@ bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);

if (!DstOnValu) {
DstHi = buildReadFirstLane(B, MRI, DstHi);
DstHi = buildReadFirstLaneSrc(B, DstHi);
} else {
MulHiInVgpr = true;
}
Expand Down Expand Up @@ -2180,6 +2229,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
B.setInstrAndDebugLoc(MI);
unsigned Opc = MI.getOpcode();
MachineRegisterInfo &MRI = OpdMapper.getMRI();

// Switch for uniformity info based regbank selection.
// Keep in sync with switches in AMDGPURegBankSelect and getInstrMapping.
switch (Opc) {
case AMDGPU::G_FADD: {
applyDefaultMapping(OpdMapper);
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
if (!Subtarget.hasSALUFloatInsts() || (Size != 32 && Size != 16))
constrainVgprDstOpWithReadfirstlane(B, MI, OpdMapper);
return;
}
default:
break;
}

switch (Opc) {
case AMDGPU::G_CONSTANT:
case AMDGPU::G_IMPLICIT_DEF: {
Expand Down Expand Up @@ -3556,6 +3620,28 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingVOPWithPreassignedDef(
const MachineInstr &MI) const {
SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
// Dst reg bank should have been set already by uniformity info
OpdsMapping[0] =
getPreAssignedOpMapping(MI.getOperand(0).getReg(), MRI, *TRI);

for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i -> I, e -> E

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see both lower case and upper case i and I used equally often. Is there a particular reason why to use upper case I?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where in the codebase do you see i ? A quick search for i = shows no result for me
In any case, coding style says variables should be uppercase so I think it's better to follow the coding style

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From /llvm/lib/
unsigned I = 1185 int I = 276
unsigned i = 2833 int i = 551

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah right, sure, but I still think that the coding style should be followed: https://llvm.org/docs/CodingStandards.html#name-types-functions-variables-and-enumerators-properly
So I is technically correct (the best kind of correct :) )

OTOH, this is really a microscopic issue (after all this is just a loop count) and I'm not going to reject the PR over that so do whatever you prefer in the end

const MachineOperand &Op = MI.getOperand(i);
if (!Op.isReg())
continue;

unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
}
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
MI.getNumOperands());
}

const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
Expand Down Expand Up @@ -3708,6 +3794,20 @@ AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getPreAssignedOpMapping(
Register Reg, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const {
const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
assert(Bank);
unsigned BankId = Bank->getID();
unsigned Size = getSizeInBits(Reg, MRI, TRI);
assert(BankId == AMDGPU::SGPRRegBankID ||
BankId == (Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID));

return AMDGPU::getValueMapping(BankId, Size);
}

const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
const MachineRegisterInfo &MRI,
Expand Down Expand Up @@ -3824,6 +3924,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {

SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());

// Switch for uniformity info based regbank selection.
// Requires pre-selected, by AMDGPURegBankSelect, reg-banks on dst registers.
// Keep in sync with switches in AMDGPURegBankSelect and applyMappingImpl.
switch (MI.getOpcode()) {
case AMDGPU::G_FADD: {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this switch need to match what's in AMDGPURegBankSelect & applyMappingImpl ? If so, please add a comment to all the switches to note that they must be synced.

Register Dst = MI.getOperand(0).getReg();
unsigned Size = MRI.getType(Dst).getSizeInBits();
const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
assert(DstBank);
if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
DstBank == &AMDGPU::SGPRRegBank)
return getDefaultMappingSOP(MI);
return getDefaultMappingVOPWithPreassignedDef(MI);
}
default:
break;
}

switch (MI.getOpcode()) {
default:
return getInvalidInstructionMapping();
Expand Down Expand Up @@ -3921,7 +4039,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
Expand Down
32 changes: 30 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H

#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/RegisterBankInfo.h"
Expand Down Expand Up @@ -57,14 +58,32 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
iterator_range<MachineBasicBlock::iterator> Range,
SmallSet<Register, 4> &SGPROperandRegs) const;

Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Src) const;
Register buildReadFirstLaneSrc(MachineIRBuilder &B, Register Src) const;

void buildReadFirstLaneDst(MachineIRBuilder &B, MachineInstr &MI) const;

MachineInstrBuilder buildReadFirstLaneForType(MachineIRBuilder &B,
const DstOp &SgprDst,
const SrcOp &VgprSrc) const;

MachineInstrBuilder buildReadFirstLaneB32(MachineIRBuilder &B,
const DstOp &SgprDst,
const SrcOp &VgprSrc) const;

MachineInstrBuilder buildReadFirstLaneSequenceOfB32(MachineIRBuilder &B,
const DstOp &SgprDst,
const SrcOp &VgprSrc,
unsigned NumElts) const;

bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI,
ArrayRef<unsigned> OpIndices) const;

void constrainOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
unsigned OpIdx) const;
void
constrainVgprDstOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
const OperandsMapper &OpdMapper) const;

bool applyMappingDynStackAlloc(MachineIRBuilder &B,
const OperandsMapper &OpdMapper,
MachineInstr &MI) const;
Expand Down Expand Up @@ -116,6 +135,12 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const;

// Return a value mapping for an operand that is same as already assigned
// reg bank or corresponds to assigned register class + LLT
const ValueMapping *
getPreAssignedOpMapping(Register Reg, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const;

// Return a value mapping for an operand that is required to be a AGPR.
const ValueMapping *getAGPROpMapping(Register Reg,
const MachineRegisterInfo &MRI,
Expand Down Expand Up @@ -155,6 +180,9 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {

const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
const InstructionMapping &
getDefaultMappingVOPWithPreassignedDef(const MachineInstr &MI) const;

const InstructionMapping &getDefaultMappingAllVGPR(
const MachineInstr &MI) const;

Expand Down
Loading
Loading