Skip to content

Commit 1ab1e79

Browse files
committed
AMDGPU: Make areMemAccessesTriviallyDisjoint more aware of segment-specific FLAT instructions
Checking the encoding alone is insufficient, since FLAT-encoded instructions can now be global or scratch instructions.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@309472 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 27eee9a commit 1ab1e79

File tree

3 files changed

+98
-37
lines changed

3 files changed

+98
-37
lines changed

lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1997,7 +1997,7 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
19971997
if (isDS(MIb))
19981998
return checkInstOffsetsDoNotOverlap(MIa, MIb);
19991999

2000-
return !isFLAT(MIb);
2000+
return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
20012001
}
20022002

20032003
if (isMUBUF(MIa) || isMTBUF(MIa)) {

lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,14 @@ class SIInstrInfo final : public AMDGPUInstrInfo {
420420
return MI.getDesc().TSFlags & SIInstrFlags::FLAT;
421421
}
422422

423+
// Returns true if MI is a FLAT encoded instruction which accesses a specific
424+
// segment, i.e. global_* or scratch_*: the FLAT flag is set and LGKM_CNT is not.
425+
static bool isSegmentSpecificFLAT(const MachineInstr &MI) {
426+
auto Flags = MI.getDesc().TSFlags;
427+
return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT);
428+
}
429+
430+
// Returns true if Opcode is any FLAT encoded instruction, including global_* and scratch_*.
423431
bool isFLAT(uint16_t Opcode) const {
424432
return get(Opcode).TSFlags & SIInstrFlags::FLAT;
425433
}

test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

Lines changed: 89 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
1+
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
2+
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
23

34
declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
45
declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
@@ -10,9 +11,13 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2
1011
@stored_constant_ptr = addrspace(3) global i32 addrspace(2)* undef, align 8
1112
@stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
1213

13-
; FUNC-LABEL: @reorder_local_load_global_store_local_load
14+
; GCN-LABEL: {{^}}reorder_local_load_global_store_local_load:
1415
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
1516
; CI: buffer_store_dword
17+
18+
; GFX9: global_store_dword
19+
; GFX9: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
20+
; GFX9: global_store_dword
1621
define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
1722
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
1823

@@ -29,10 +34,14 @@ define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrsp
2934
ret void
3035
}
3136

32-
; FUNC-LABEL: @no_reorder_local_load_volatile_global_store_local_load
37+
; GCN-LABEL: {{^}}no_reorder_local_load_volatile_global_store_local_load:
3338
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
3439
; CI: buffer_store_dword
3540
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
41+
42+
; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
43+
; GFX9: global_store_dword
44+
; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
3645
define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
3746
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
3847

@@ -49,10 +58,16 @@ define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_loa
4958
ret void
5059
}
5160

52-
; FUNC-LABEL: @no_reorder_barrier_local_load_global_store_local_load
61+
; GCN-LABEL: {{^}}no_reorder_barrier_local_load_global_store_local_load:
5362
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
5463
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
5564
; CI: buffer_store_dword
65+
66+
; GFX9: global_store_dword
67+
; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
68+
; GFX9: s_barrier
69+
; GFX9: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
70+
; GFX9: global_store_dword
5671
define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
5772
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
5873

@@ -70,13 +85,20 @@ define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load
7085
ret void
7186
}
7287

73-
; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
74-
; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
75-
; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
88+
; GCN-LABEL: {{^}}reorder_constant_load_global_store_constant_load:
89+
; GCN-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
90+
; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
7691
; CI: buffer_store_dword
92+
7793
; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
7894
; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
95+
96+
; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x4
97+
; GFX9: global_store_dword
98+
; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xc
99+
79100
; CI: buffer_store_dword
101+
; GFX9: global_store_dword
80102
define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
81103
%ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
82104

@@ -93,13 +115,19 @@ define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32
93115
ret void
94116
}
95117

96-
; FUNC-LABEL: @reorder_constant_load_local_store_constant_load
97-
; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
98-
; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
118+
; GCN-LABEL: {{^}}reorder_constant_load_local_store_constant_load:
119+
; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
120+
; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
121+
99122
; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
100123
; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
101-
; CI: ds_write_b32
124+
125+
; GFX9-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x4
126+
; GFX9-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xc
127+
128+
; GCN: ds_write_b32
102129
; CI: buffer_store_dword
130+
; GFX9: global_store_dword
103131
define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
104132
%ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
105133

@@ -116,12 +144,13 @@ define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 a
116144
ret void
117145
}
118146

119-
; FUNC-LABEL: @reorder_smrd_load_local_store_smrd_load
120-
; CI: s_load_dword
121-
; CI: s_load_dword
122-
; CI: s_load_dword
123-
; CI: ds_write_b32
147+
; GCN-LABEL: {{^}}reorder_smrd_load_local_store_smrd_load:
148+
; GCN: s_load_dword
149+
; GCN: s_load_dword
150+
; GCN: s_load_dword
151+
; GCN: ds_write_b32
124152
; CI: buffer_store_dword
153+
; GFX9: global_store_dword
125154
define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
126155
%ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
127156
%ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
@@ -136,11 +165,15 @@ define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace
136165
ret void
137166
}
138167

139-
; FUNC-LABEL: @reorder_global_load_local_store_global_load
168+
; GCN-LABEL: {{^}}reorder_global_load_local_store_global_load:
140169
; CI: ds_write_b32
141170
; CI: buffer_load_dword
142171
; CI: buffer_load_dword
143172
; CI: buffer_store_dword
173+
174+
; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4
175+
; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12
176+
; GFX9: ds_write_b32
144177
define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
145178
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
146179
%ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3
@@ -155,12 +188,13 @@ define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrs
155188
ret void
156189
}
157190

158-
; FUNC-LABEL: @reorder_local_offsets
159-
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
160-
; CI-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100
161-
; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
191+
; GCN-LABEL: {{^}}reorder_local_offsets:
192+
; GCN: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
193+
; GCN-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100
194+
; GCN-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
162195
; CI: buffer_store_dword
163-
; CI: s_endpgm
196+
; GFX9: global_store_dword
197+
; GCN: s_endpgm
164198
define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
165199
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
166200
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
@@ -179,14 +213,22 @@ define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %ou
179213
ret void
180214
}
181215

182-
; FUNC-LABEL: @reorder_global_offsets
216+
; GCN-LABEL: {{^}}reorder_global_offsets:
183217
; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
184218
; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
185219
; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
186220
; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
187221
; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
188222
; CI: buffer_store_dword
189223
; CI: s_endpgm
224+
225+
; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400
226+
; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408
227+
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12
228+
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400
229+
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408
230+
; GFX9: global_store_dword
231+
; GFX9: s_endpgm
190232
define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
191233
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
192234
%ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
@@ -205,22 +247,33 @@ define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %o
205247
ret void
206248
}
207249

208-
; FUNC-LABEL: {{^}}reorder_global_offsets_addr64_soffset0:
209-
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:12{{$}}
210-
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:28{{$}}
211-
; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:44{{$}}
250+
; GCN-LABEL: {{^}}reorder_global_offsets_addr64_soffset0:
251+
; CI: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
252+
; CI-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:28{{$}}
253+
; CI-NEXT: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:44{{$}}
254+
255+
; CI: v_mov_b32
256+
; CI: v_mov_b32
257+
258+
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
259+
; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
260+
261+
; CI: v_add_i32
262+
; CI: v_add_i32
263+
264+
; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}}
265+
; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}}
212266

213-
; GCN: v_mov_b32
214-
; GCN: v_mov_b32
215267

216-
; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64{{$}}
217-
; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:20{{$}}
268+
; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12
269+
; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28
270+
; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44
218271

219-
; GCN: v_add_i32
220-
; GCN: v_add_i32
272+
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}}
273+
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20
221274

222-
; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:36{{$}}
223-
; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}}
275+
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36
276+
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52
224277
define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
225278
%id = call i32 @llvm.amdgcn.workitem.id.x()
226279
%id.ext = sext i32 %id to i64
@@ -245,7 +298,7 @@ define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(
245298
ret void
246299
}
247300

248-
; XFUNC-LABEL: @reorder_local_load_tbuffer_store_local_load
301+
; XGCN-LABEL: {{^}}reorder_local_load_tbuffer_store_local_load:
249302
; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4
250303
; XCI: TBUFFER_STORE_FORMAT
251304
; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8

0 commit comments

Comments
 (0)