Skip to content

[AMDGPU] Use alias info to relax waitcounts for LDS DMA #74537

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Jan 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 74 additions & 5 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/InitializePasses.h"
Expand Down Expand Up @@ -121,8 +122,13 @@ enum RegisterMapping {
SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
// Artificial register slots to track LDS writes into specific LDS locations
// if a location is known. When slots are exhausted or location is
// unknown use the first slot. The first slot is also always updated in
// addition to known location's slot to properly generate waits if dependent
// instruction's location is unknown.
EXTRA_VGPR_LDS = 0,
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};

Expand Down Expand Up @@ -297,6 +303,10 @@ class WaitcntBrackets {
PendingEvents |= WaitEventMaskForInst[VS_CNT];
}

ArrayRef<const MachineInstr *> getLDSDMAStores() const {
return LDSDMAStores;
}

void print(raw_ostream &);
void dump() { print(dbgs()); }

Expand Down Expand Up @@ -359,6 +369,9 @@ class WaitcntBrackets {
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
// Store representative LDS DMA operations. The only useful info here is
// alias info. One store is kept per unique AAInfo.
SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
};

class SIInsertWaitcnts : public MachineFunctionPass {
Expand All @@ -373,6 +386,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
MachineLoopInfo *MLI;
MachinePostDominatorTree *PDT;
AliasAnalysis *AA = nullptr;

struct BlockInfo {
std::unique_ptr<WaitcntBrackets> Incoming;
Expand Down Expand Up @@ -415,6 +429,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
AU.setPreservesCFG();
AU.addRequired<MachineLoopInfo>();
AU.addRequired<MachinePostDominatorTree>();
AU.addUsedIfAvailable<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}

Expand Down Expand Up @@ -707,7 +723,40 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
(TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
// written can be accessed. A load from LDS to VMEM does not need a wait.
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
unsigned Slot = 0;
for (const auto *MemOp : Inst.memoperands()) {
if (!MemOp->isStore() ||
MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
continue;
// Comparing just AA info does not guarantee memoperands are equal
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want exact equality of location, you would use the Value/PseudoSourceValue?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MI::mayAlias is extremely conservative and does not work with PseudoSourceValue, it just ignores AA tags in this case. The Value is also unreliable because it is really a GEP, always a different GEP.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Speaking of Value, this is how a real program's MIR look like. These are all loads and stores of the same arrays. You may see that Value there is not helpful, only alias scope is (look at the second memory operand, LDS store):

  BUFFER_LOAD_DWORD_LDS_OFFEN %51:vgpr_32, %48:sgpr_128, %52:sreg_32, 0, 0, 0, implicit $exec, implicit $m0 :: (dereferenceable load (s32) from `ptr addrspace(1) poison`, align 1, !alias.scope !7, !noalias !10, addrspace 1), (dereferenceable store (s32) into %ir.10, align 1, !alias.scope !7, !noalias !10, addrspace 3)
  BUFFER_LOAD_DWORD_LDS_OFFEN killed %54:vgpr_32, %48:sgpr_128, %52:sreg_32, 0, 0, 0, implicit $exec, implicit $m0 :: (dereferenceable load (s32) from `ptr addrspace(1) poison`, align 1, !alias.scope !7, !noalias !10, addrspace 1), (dereferenceable store (s32) into %ir.12, align 1, !alias.scope !7, !noalias !10, addrspace 3)
  BUFFER_LOAD_DWORD_LDS_OFFEN killed %56:vgpr_32, %48:sgpr_128, %52:sreg_32, 0, 0, 0, implicit $exec, implicit $m0 :: (dereferenceable load (s32) from `ptr addrspace(1) poison`, align 1, !alias.scope !7, !noalias !10, addrspace 1), (dereferenceable store (s32) into %ir.14, align 1, !alias.scope !7, !noalias !10, addrspace 3)
  BUFFER_LOAD_DWORD_LDS_OFFEN killed %58:vgpr_32, %48:sgpr_128, %52:sreg_32, 0, 0, 0, implicit $exec, implicit $m0 :: (dereferenceable load (s32) from `ptr addrspace(1) poison`, align 1, !alias.scope !7, !noalias !10, addrspace 1), (dereferenceable store (s32) into %ir.16, align 1, !alias.scope !7, !noalias !10, addrspace 3)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PseudoSourceValue::mayAlias is supposed to report aliasing to possible IR values. It looks like it's layered weirdly, and expects you to go through MachineInstr::mayAlias. MachineInstr::mayAlias ought to be using the AA tags, it shouldn't be a fundamental limitation

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks to me like it does use it if you pass UseTBAA=true. Not sure why this would be a parameter in the first place

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PseudoSourceValue::mayAlias is supposed to report aliasing to possible IR values. It looks like it's layered weirdly, and expects you to go through MachineInstr::mayAlias. MachineInstr::mayAlias ought to be using the AA tags, it shouldn't be a fundamental limitation

This is all PSV::mayAlias() does:

bool PseudoSourceValue::mayAlias(const MachineFrameInfo *) const {
  return !(isGOT() || isConstantPool() || isJumpTable());
}

Not very useful. Then even to get to the AA tags check MI::mayAlias() shall go through all IR values' checks first.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks to me like it does use it if you pass UseTBAA=true. Not sure why this would be a parameter in the first place

I am passing it, but to get to that check it shall first go through all Value and offset checks. Using AA is the last thing it does: https://llvm.org/doxygen/MachineInstr_8cpp_source.html#l01285

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The values don't need to be identical, that's the point of the AA query. BasicAA will parse through the offsets

I also think that values don't need to be identical. But that is what MI:mayAlias() does before it checks AA: https://llvm.org/doxygen/MachineInstr_8cpp_source.html#l01285

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But there's no PseudoSourceValue in this example, it should be a straightforward Value-to-Value comparison

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, there is no PSV. I have mentioned PSV because you have earlier suggested to use it. For the real IR value: it is not helpful to compare it. The IR value is a GEP, and this GEP is always different. I.e. these values never compare equal. The rest of the IR is already gone and unavailable for the analysis. Even if it would be available this GEP will address kernel module LDS variable, a single huge LDS array, and will be useless again. In this case it will tell you any LDS operation aliases any other. Now during the module LDS lowering I am creating alias scope info specifically to disambiguate aliasing after the pass has squashed all LDS variables.

// in general, but this is so for LDS DMA in practice.
auto AAI = MemOp->getAAInfo();
// Alias scope information gives a way to definitely identify an
// original memory object and is practically produced in the module LDS
// lowering pass. If there is no scope available we will not be able
// to disambiguate LDS aliasing as after the module lowering all LDS
// is squashed into a single big object. Do not attempt to use one of
// the limited LDSDMAStores for something we will not be able to use
// anyway.
if (!AAI || !AAI.Scope)
break;
for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I understand the equality check here; you perform an actual mayAlias query later? What is the point of this filter? Don't you need to consider any possibly aliasing write as an event?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alias check is done later at the line 1229, and this is also a place where any aliasing access gets actual waits, read or write. Anyway, that part did not change here. This search is just to find if we already have a slot allocated for this memory location. In reality we do not need instruction, we just need LDS memory location, but since mayAlias is an interface which needs a MachineInstr, I am searching and keeping instructions in the list. All we really need from this instruction is AA tags.

Slot = I + 1;
break;
}
}
}
if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
break;
LDSDMAStores.push_back(&Inst);
Slot = LDSDMAStores.size();
break;
}
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
if (Slot)
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
}
}
}
Expand Down Expand Up @@ -1183,9 +1232,27 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// No need to wait before load from VMEM to LDS.
if (TII->mayWriteLDSThroughDMA(MI))
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;

// VM_CNT is only relevant to vgpr or LDS.
ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
bool FoundAliasingStore = false;
// Only objects with alias scope info were added to the LDSDMAStores array.
// In the absence of the scope info we will not be able to disambiguate
// aliasing here. There is no need to try searching for a corresponding
// store slot. This is conservatively correct because in that case we
// will produce a wait using the first (general) LDS DMA wait slot which
// will wait on all of them anyway.
if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still don't understand the usage of scope; scope isn't special, isn't common and I do not at all like specially treating it. I think you should just let the AA query figure out what to do with it

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have reserved just 8 pseudo registers to track it. I do not want to fill it with unrelated stuff. I know that the only way AA will be able to handle this very specific situation is if there is scope info, otherwise there is no reason to waste a slot and compile time. If I do not enter this 'if' the pass will just do conservatively correct thing and wait for this memory regardless of aliasing or lack of it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have added more comments to explain this. The place which fills the LDS DMA slot bails if there is no scope info not to waste limited tracking slots. In that case a generic first slot is still used for such operation (it is always used, regardless if we can or cannot be more specific about the underlying object). Here AA will be unable to disambiguate aliasing if there is no scope info, so this condition is simply a shortcut to avoid an expensive loop and AA query. I can remove this part of the condition here and nothing will change except it will work slower. Note that not entering this 'if' statement will always produce a conservatively correct wait using first generic tracking slot, which always gets a score regardless of our ability to track a specific object. The condition is around the relaxation code to avoid a generic and conservative 'wait for everything' part below.

const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
FoundAliasingStore = true;
ScoreBrackets.determineWait(VM_CNT, RegNo + I + 1, Wait);
}
}
}
if (!FoundAliasingStore)
ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
if (Memop->isStore()) {
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
}
Expand Down Expand Up @@ -1834,6 +1901,8 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MLI = &getAnalysis<MachineLoopInfo>();
PDT = &getAnalysis<MachinePostDominatorTree>();
if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
AA = &AAR->getAAResults();

ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
for (auto T : inst_counter_types())
Expand Down
113 changes: 97 additions & 16 deletions llvm/test/CodeGen/AMDGPU/lds-dma-waits.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,23 @@

@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
@lds.2 = internal addrspace(3) global [64 x float] poison, align 16
@lds.3 = internal addrspace(3) global [64 x float] poison, align 16
@lds.4 = internal addrspace(3) global [64 x float] poison, align 16
@lds.5 = internal addrspace(3) global [64 x float] poison, align 16
@lds.6 = internal addrspace(3) global [64 x float] poison, align 16
@lds.7 = internal addrspace(3) global [64 x float] poison, align 16
@lds.8 = internal addrspace(3) global [64 x float] poison, align 16
@lds.9 = internal addrspace(3) global [64 x float] poison, align 16

declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)

; FIXME: vmcnt(0) is too strong, it shall use vmcnt(2) before the first
; ds_read_b32 and vmcnt(0) before the second.

; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
; GCN-COUNT-4: buffer_load_dword
; GCN: s_waitcnt vmcnt(0)
; GCN: s_waitcnt vmcnt(2)
; GCN: ds_read_b32

; FIXME:
; GCN-NOT: s_waitcnt

; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_2_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
Expand All @@ -43,15 +45,9 @@ main_body:
; GCN-COUNT-4: global_load_dword
; GFX9: s_waitcnt vmcnt(0)
; GFX9-COUNT-2: ds_read_b32

; FIXME: can be vmcnt(2)

; GFX10: s_waitcnt vmcnt(0)
; GFX10: s_waitcnt vmcnt(2)
; GFX10: ds_read_b32

; FIXME:
; GFX10-NOT: s_waitcnt

; GFX10: s_waitcnt vmcnt(0)
; GFX10: ds_read_b32
define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
Expand All @@ -70,4 +66,89 @@ main_body:
ret void
}

; There are 8 pseudo registers defined to track LDS DMA dependencies.
; When exhausted we default to vmcnt(0).

; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
; GCN-COUNT-10: buffer_load_dword
; GCN: s_waitcnt vmcnt(8)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(7)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(6)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(5)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(4)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(3)
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(2)
; GCN-NOT: s_waitcnt vmcnt
; GCN: ds_read_b32
; GCN: s_waitcnt vmcnt(0)
; GCN: ds_read_b32
define amdgpu_kernel void @buffer_load_lds_dword_10_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8, i32 %i9, ptr addrspace(1) %out) {
main_body:
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.2, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.3, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.4, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.5, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.6, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.7, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.8, i32 4, i32 0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.9, i32 4, i32 0, i32 0, i32 0, i32 0)
%gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
%gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
%gep.2 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i2
%gep.3 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
%gep.4 = getelementptr float, ptr addrspace(3) @lds.4, i32 %i2
%gep.5 = getelementptr float, ptr addrspace(3) @lds.5, i32 %i2
%gep.6 = getelementptr float, ptr addrspace(3) @lds.6, i32 %i2
%gep.7 = getelementptr float, ptr addrspace(3) @lds.7, i32 %i2
%gep.8 = getelementptr float, ptr addrspace(3) @lds.8, i32 %i2
%gep.9 = getelementptr float, ptr addrspace(3) @lds.9, i32 %i2
%val.0 = load float, ptr addrspace(3) %gep.0, align 4
call void @llvm.amdgcn.wave.barrier()
%val.1 = load float, ptr addrspace(3) %gep.1, align 4
call void @llvm.amdgcn.wave.barrier()
%val.2 = load float, ptr addrspace(3) %gep.2, align 4
call void @llvm.amdgcn.wave.barrier()
%val.3 = load float, ptr addrspace(3) %gep.3, align 4
call void @llvm.amdgcn.wave.barrier()
%val.4 = load float, ptr addrspace(3) %gep.4, align 4
call void @llvm.amdgcn.wave.barrier()
%val.5 = load float, ptr addrspace(3) %gep.5, align 4
call void @llvm.amdgcn.wave.barrier()
%val.6 = load float, ptr addrspace(3) %gep.6, align 4
call void @llvm.amdgcn.wave.barrier()
%val.7 = load float, ptr addrspace(3) %gep.7, align 4
call void @llvm.amdgcn.wave.barrier()
%val.8 = load float, ptr addrspace(3) %gep.8, align 4
call void @llvm.amdgcn.wave.barrier()
%val.9 = load float, ptr addrspace(3) %gep.9, align 4
%out.gep.1 = getelementptr float, ptr addrspace(1) %out, i32 1
%out.gep.2 = getelementptr float, ptr addrspace(1) %out, i32 2
%out.gep.3 = getelementptr float, ptr addrspace(1) %out, i32 3
%out.gep.4 = getelementptr float, ptr addrspace(1) %out, i32 4
%out.gep.5 = getelementptr float, ptr addrspace(1) %out, i32 5
%out.gep.6 = getelementptr float, ptr addrspace(1) %out, i32 6
%out.gep.7 = getelementptr float, ptr addrspace(1) %out, i32 7
%out.gep.8 = getelementptr float, ptr addrspace(1) %out, i32 8
%out.gep.9 = getelementptr float, ptr addrspace(1) %out, i32 9
store float %val.0, ptr addrspace(1) %out
store float %val.1, ptr addrspace(1) %out.gep.1
store float %val.2, ptr addrspace(1) %out.gep.2
store float %val.3, ptr addrspace(1) %out.gep.3
store float %val.4, ptr addrspace(1) %out.gep.4
store float %val.5, ptr addrspace(1) %out.gep.5
store float %val.6, ptr addrspace(1) %out.gep.6
store float %val.7, ptr addrspace(1) %out.gep.7
store float %val.8, ptr addrspace(1) %out.gep.8
store float %val.9, ptr addrspace(1) %out.gep.9
ret void
}

declare void @llvm.amdgcn.wave.barrier()