@@ -237,10 +237,24 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
237
237
; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
238
238
; GFX940: ; %bb.0:
239
239
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
240
+ ; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024
241
+ ; GFX940-NEXT: s_mov_b64 s[0:1], 0
242
+ ; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
243
+ ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
244
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
245
+ ; GFX940-NEXT: v_mov_b32_e32 v5, v3
246
+ ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
240
247
; GFX940-NEXT: buffer_wbl2 sc1
241
- ; GFX940-NEXT: global_atomic_pk_add_f16 v0 , v[0:1], v2 , off offset:1024 sc0
248
+ ; GFX940-NEXT: global_atomic_cmpswap v3 , v[0:1], v[4:5] , off offset:1024 sc0
242
249
; GFX940-NEXT: s_waitcnt vmcnt(0)
243
250
; GFX940-NEXT: buffer_inv sc1
251
+ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
252
+ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
253
+ ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
254
+ ; GFX940-NEXT: s_cbranch_execnz .LBB17_1
255
+ ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
256
+ ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
257
+ ; GFX940-NEXT: v_mov_b32_e32 v0, v3
244
258
; GFX940-NEXT: s_setpc_b64 s[30:31]
245
259
%gep = getelementptr <2 x half >, ptr addrspace (1 ) %ptr , i32 256
246
260
%result = atomicrmw fadd ptr addrspace (1 ) %gep , <2 x half > %val syncscope("agent" ) seq_cst
@@ -251,10 +265,23 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
251
265
; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
252
266
; GFX940: ; %bb.0:
253
267
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
268
+ ; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024
269
+ ; GFX940-NEXT: s_mov_b64 s[0:1], 0
270
+ ; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
271
+ ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
272
+ ; GFX940-NEXT: s_waitcnt vmcnt(0)
273
+ ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
254
274
; GFX940-NEXT: buffer_wbl2 sc1
255
- ; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2 , off offset:1024
275
+ ; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5] , off offset:1024 sc0
256
276
; GFX940-NEXT: s_waitcnt vmcnt(0)
257
277
; GFX940-NEXT: buffer_inv sc1
278
+ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
279
+ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
280
+ ; GFX940-NEXT: v_mov_b32_e32 v5, v3
281
+ ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
282
+ ; GFX940-NEXT: s_cbranch_execnz .LBB18_1
283
+ ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
284
+ ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
258
285
; GFX940-NEXT: s_setpc_b64 s[30:31]
259
286
%gep = getelementptr <2 x half >, ptr addrspace (1 ) %ptr , i32 256
260
287
%unused = atomicrmw fadd ptr addrspace (1 ) %gep , <2 x half > %val syncscope("agent" ) seq_cst
@@ -265,10 +292,24 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
265
292
; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
266
293
; GFX940: ; %bb.0:
267
294
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
295
+ ; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024
296
+ ; GFX940-NEXT: s_mov_b64 s[0:1], 0
297
+ ; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
298
+ ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
299
+ ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
300
+ ; GFX940-NEXT: v_mov_b32_e32 v5, v3
301
+ ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
268
302
; GFX940-NEXT: buffer_wbl2 sc1
269
- ; GFX940-NEXT: flat_atomic_pk_add_f16 v0 , v[0:1], v2 offset:1024 sc0
303
+ ; GFX940-NEXT: flat_atomic_cmpswap v3 , v[0:1], v[4:5] offset:1024 sc0
270
304
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
271
305
; GFX940-NEXT: buffer_inv sc1
306
+ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
307
+ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
308
+ ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
309
+ ; GFX940-NEXT: s_cbranch_execnz .LBB19_1
310
+ ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
311
+ ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
312
+ ; GFX940-NEXT: v_mov_b32_e32 v0, v3
272
313
; GFX940-NEXT: s_setpc_b64 s[30:31]
273
314
%gep = getelementptr <2 x half >, ptr %ptr , i32 256
274
315
%result = atomicrmw fadd ptr %gep , <2 x half > %val syncscope("agent" ) seq_cst
@@ -279,10 +320,23 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
279
320
; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
280
321
; GFX940: ; %bb.0:
281
322
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
323
+ ; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024
324
+ ; GFX940-NEXT: s_mov_b64 s[0:1], 0
325
+ ; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
326
+ ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
327
+ ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
328
+ ; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
282
329
; GFX940-NEXT: buffer_wbl2 sc1
283
- ; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
330
+ ; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
284
331
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
285
332
; GFX940-NEXT: buffer_inv sc1
333
+ ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
334
+ ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
335
+ ; GFX940-NEXT: v_mov_b32_e32 v5, v3
336
+ ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
337
+ ; GFX940-NEXT: s_cbranch_execnz .LBB20_1
338
+ ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
339
+ ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
286
340
; GFX940-NEXT: s_setpc_b64 s[30:31]
287
341
%gep = getelementptr <2 x half >, ptr %ptr , i32 256
288
342
%unused = atomicrmw fadd ptr %gep , <2 x half > %val syncscope("agent" ) seq_cst
0 commit comments