-
Notifications
You must be signed in to change notification settings - Fork 14.6k
[AMDGPU] Use alias info to relax waitcounts for LDS DMA #74537
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7e38262
bab128b
cb78351
21c55d2
0587c28
ac7a070
7bab56e
d655f12
0d72766
1f8f853
e41607a
0f78060
b86d65b
3f8e9b1
82ec408
92e7977
a5d95fd
4f71d9b
a7abeb8
0115e30
5988047
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ | |
#include "llvm/ADT/MapVector.h" | ||
#include "llvm/ADT/PostOrderIterator.h" | ||
#include "llvm/ADT/Sequence.h" | ||
#include "llvm/Analysis/AliasAnalysis.h" | ||
#include "llvm/CodeGen/MachineLoopInfo.h" | ||
#include "llvm/CodeGen/MachinePostDominators.h" | ||
#include "llvm/InitializePasses.h" | ||
|
@@ -121,8 +122,13 @@ enum RegisterMapping { | |
SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets. | ||
AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets. | ||
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets. | ||
NUM_EXTRA_VGPRS = 1, // A reserved slot for DS. | ||
EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes. | ||
NUM_EXTRA_VGPRS = 9, // Reserved slots for DS. | ||
// Artificial register slots to track LDS writes into specific LDS locations | ||
// if a location is known. When slots are exhausted or location is | ||
// unknown use the first slot. The first slot is also always updated in | ||
// addition to known location's slot to properly generate waits if dependent | ||
// instruction's location is unknown. | ||
EXTRA_VGPR_LDS = 0, | ||
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts. | ||
}; | ||
|
||
|
@@ -297,6 +303,10 @@ class WaitcntBrackets { | |
PendingEvents |= WaitEventMaskForInst[VS_CNT]; | ||
} | ||
|
||
ArrayRef<const MachineInstr *> getLDSDMAStores() const { | ||
return LDSDMAStores; | ||
} | ||
|
||
void print(raw_ostream &); | ||
void dump() { print(dbgs()); } | ||
|
||
|
@@ -359,6 +369,9 @@ class WaitcntBrackets { | |
// Bitmask of the VmemTypes of VMEM instructions that might have a pending | ||
// write to each vgpr. | ||
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; | ||
// Store representative LDS DMA operations. The only useful info here is | ||
// alias info. One store is kept per unique AAInfo. | ||
SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores; | ||
}; | ||
|
||
class SIInsertWaitcnts : public MachineFunctionPass { | ||
|
@@ -373,6 +386,7 @@ class SIInsertWaitcnts : public MachineFunctionPass { | |
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush; | ||
MachineLoopInfo *MLI; | ||
MachinePostDominatorTree *PDT; | ||
AliasAnalysis *AA = nullptr; | ||
|
||
struct BlockInfo { | ||
std::unique_ptr<WaitcntBrackets> Incoming; | ||
|
@@ -415,6 +429,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { | |
AU.setPreservesCFG(); | ||
AU.addRequired<MachineLoopInfo>(); | ||
AU.addRequired<MachinePostDominatorTree>(); | ||
AU.addUsedIfAvailable<AAResultsWrapperPass>(); | ||
AU.addPreserved<AAResultsWrapperPass>(); | ||
MachineFunctionPass::getAnalysisUsage(AU); | ||
} | ||
|
||
|
@@ -707,7 +723,40 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, | |
(TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) { | ||
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS | ||
// written can be accessed. A load from LDS to VMEM does not need a wait. | ||
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); | ||
unsigned Slot = 0; | ||
for (const auto *MemOp : Inst.memoperands()) { | ||
if (!MemOp->isStore() || | ||
MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS) | ||
continue; | ||
// Comparing just AA info does not guarantee memoperands are equal | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you want exact equality of location, you would use the Value/PseudoSourceValue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. MI::mayAlias is extremely conservative and does not work with PsedoSourceValue, it just ignores AA tags in this case. The Value is also unreliable because it is really a GEP, always a different GEP. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Speaking of Value, this is how a real program's MIR look like. These are all loads and stores of the same arrays. You may see that Value there is not helpful, only alias scope is (look at the second memory operand, LDS store):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PseudoSourceValue::mayAlias is supposed to report aliasing to possible IR values. It looks like it's layered weirdly, and expects you to go through MachineInstr::mayAlias. MachineInstr::mayAlias ought to be using the AA tags, it shouldn't be a fundamental limitation There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks to me like it does use it if you pass UseTBAA=true. Not sure why this would be a parameter in the first place There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This is all PSV::mayAlias() does:
No very useful. Then even to get to the AA tags check MI:mayAlias() shall go through all IR values' checks first. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I am passing it, but to get to that check it shall first go through all Value and offset checks. Using AA is the last thing it does: https://llvm.org/doxygen/MachineInstr_8cpp_source.html#l01285 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I also think that values don't need to be identical. But that is what MI:mayAlias() does before it checks AA: https://llvm.org/doxygen/MachineInstr_8cpp_source.html#l01285 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But there's no PseudoSourceValue in this example, it should be a straightforward Value-to-Value comparison There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, there is no PSV. I have mentioned PSV because you have earlier suggested to use it. For the real IR value: it is not helpful to compare it. The IR value is a GEP, and this GEP is always different. I.e. these values never compare equal. The rest of the IR is already gone and unavailable for the analysis. Even if it would be available this GEP will address kernel module LDS variable, a single huge LDS array, and will be useless again. In this case it will tell you any LDS operation aliases any other. Now during the module LDS lowering I am creating alias scope info specifically to disambiguate aliasing after the pass has squashed all LDS variables. |
||
// in general, but this is so for LDS DMA in practice. | ||
auto AAI = MemOp->getAAInfo(); | ||
// Alias scope information gives a way to definitely identify an | ||
// original memory object and practically produced in the module LDS | ||
// lowering pass. If there is no scope available we will not be able | ||
// to disambiguate LDS aliasing as after the module lowering all LDS | ||
// is squashed into a single big object. Do not attempt to use one of | ||
// the limited LDSDMAStores for something we will not be able to use | ||
// anyway. | ||
if (!AAI || !AAI.Scope) | ||
break; | ||
for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) { | ||
for (const auto *MemOp : LDSDMAStores[I]->memoperands()) { | ||
if (MemOp->isStore() && AAI == MemOp->getAAInfo()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I understand the equality check here; you perform an actual mayAlias query later? What is the point of this filter? Don't you need to consider any possibly aliasing write as an event? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alias check is done later at the line 1229, and this is also a place where any aliasing access gets actual waits, read or write. Anyway, that part did not change here. This search is just to find if we already have a slot allocated for this memory location. In reality we do not need instruction, we just need LDS memory location, but since mayAlias is an interface which needs a MachineInstr, I am searching and keeping instructions in the list. All we really need from this instruction is AA tags. |
||
Slot = I + 1; | ||
break; | ||
} | ||
} | ||
} | ||
if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1) | ||
break; | ||
LDSDMAStores.push_back(&Inst); | ||
Slot = LDSDMAStores.size(); | ||
break; | ||
} | ||
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore); | ||
if (Slot) | ||
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore); | ||
} | ||
} | ||
} | ||
|
@@ -1183,9 +1232,27 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, | |
// No need to wait before load from VMEM to LDS. | ||
if (TII->mayWriteLDSThroughDMA(MI)) | ||
continue; | ||
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; | ||
|
||
// VM_CNT is only relevant to vgpr or LDS. | ||
ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); | ||
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; | ||
bool FoundAliasingStore = false; | ||
// Only objects with alias scope info were added to LDSDMAScopes array. | ||
// In the absense of the scope info we will not be able to disambiguate | ||
// aliasing here. There is no need to try searching for a corresponding | ||
// store slot. This is conservatively correct because in that case we | ||
// will produce a wait using the first (general) LDS DMA wait slot which | ||
// will wait on all of them anyway. | ||
if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still don't understand the usage of scope; scope isn't special, isn't common and I do at all like specially treating it. I think you should just let the AA query figure out what to do with it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have reserved just 8 pseudo registers to track it. I do not want to fill it with unrelated stuff. I know that the only way AA will be able to handle this very specific situation is if there is scope info, otherwise there is no reason to waste a slot and compile time. If I do not enter this 'if' the pass will just do conservatively correct thing and wait for this memory regardless of aliasing or lack of it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have added more comments to explain this. The place which fills the LDS DMA slot bails if there is no scope info not to waste limited tracking slots. In that case a generic first slot is still used for such operation (it is always used, regardless if we can or cannot be more specific about the underlying object). Here AA will be unable to disambiguate aliasing if there is no scope info, so this condition is simply a shortcut to avoid an expensive loop and AA query. I can remove this part of the condition here and nothing will change except it will work slower. Note that not entering this 'if' statement will always produce a conservatively correct wait using first generic tracking slot, which always gets a score regardless of our ability to track a specific object. The condition is around the relaxation code to avoid a generic and conservative 'wait for everything' part below. |
||
const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores(); | ||
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) { | ||
if (MI.mayAlias(AA, *LDSDMAStores[I], true)) { | ||
FoundAliasingStore = true; | ||
ScoreBrackets.determineWait(VM_CNT, RegNo + I + 1, Wait); | ||
} | ||
} | ||
} | ||
if (!FoundAliasingStore) | ||
ScoreBrackets.determineWait(VM_CNT, RegNo, Wait); | ||
if (Memop->isStore()) { | ||
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait); | ||
} | ||
|
@@ -1834,6 +1901,8 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { | |
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); | ||
MLI = &getAnalysis<MachineLoopInfo>(); | ||
PDT = &getAnalysis<MachinePostDominatorTree>(); | ||
if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) | ||
AA = &AAR->getAAResults(); | ||
|
||
ForceEmitZeroWaitcnts = ForceEmitZeroFlag; | ||
for (auto T : inst_counter_types()) | ||
|
Uh oh!
There was an error while loading. Please reload this page.