Skip to content

Commit d4e46f0

Browse files
authored
[AMDGPU] Fix machine verification failure from INIT_EXEC lowering (#98333)
Fix a machine verification failure in INIT_EXEC lowering that has occurred since the lowering was moved from SILowerControlFlow to SIWholeQuadMode in #94452.
1 parent ce92b2f commit d4e46f0

File tree

2 files changed

+78
-1
lines changed

2 files changed

+78
-1
lines changed

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1676,6 +1676,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
16761676
if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
16771677
LowerToMovInstrs.empty() && KillInstrs.empty()) {
16781678
lowerLiveMaskQueries();
1679+
if (!InitExecInstrs.empty())
1680+
LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
16791681
return !InitExecInstrs.empty() || !LiveMaskQueries.empty();
16801682
}
16811683

@@ -1717,7 +1719,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
17171719
LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
17181720

17191721
// If we performed any kills or lowered any INIT_EXEC instructions then recompute EXEC
1720-
if (!KillInstrs.empty())
1722+
if (!KillInstrs.empty() || !InitExecInstrs.empty())
17211723
LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
17221724

17231725
return true;

llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 75 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3463,6 +3463,81 @@ bb:
34633463
ret void
34643464
}
34653465

3466+
; Test a case that failed machine verification.
3467+
define amdgpu_gs void @wqm_init_exec_switch(i32 %arg) {
3468+
; GFX9-W64-LABEL: wqm_init_exec_switch:
3469+
; GFX9-W64: ; %bb.0:
3470+
; GFX9-W64-NEXT: s_mov_b64 exec, 0
3471+
; GFX9-W64-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0
3472+
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
3473+
; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3474+
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
3475+
; GFX9-W64-NEXT: s_endpgm
3476+
;
3477+
; GFX10-W32-LABEL: wqm_init_exec_switch:
3478+
; GFX10-W32: ; %bb.0:
3479+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3480+
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
3481+
; GFX10-W32-NEXT: v_cmpx_lt_i32_e32 0, v0
3482+
; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
3483+
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
3484+
; GFX10-W32-NEXT: s_endpgm
3485+
call void @llvm.amdgcn.init.exec(i64 0)
3486+
switch i32 %arg, label %bb1 [
3487+
i32 0, label %bb3
3488+
i32 1, label %bb2
3489+
]
3490+
bb1:
3491+
ret void
3492+
bb2:
3493+
ret void
3494+
bb3:
3495+
ret void
3496+
}
3497+
3498+
define amdgpu_gs void @wqm_init_exec_wwm() {
3499+
; GFX9-W64-LABEL: wqm_init_exec_wwm:
3500+
; GFX9-W64: ; %bb.0:
3501+
; GFX9-W64-NEXT: s_mov_b64 exec, 0
3502+
; GFX9-W64-NEXT: s_mov_b32 s1, 0
3503+
; GFX9-W64-NEXT: s_mov_b32 s0, s1
3504+
; GFX9-W64-NEXT: s_cmp_lg_u64 exec, 0
3505+
; GFX9-W64-NEXT: s_cselect_b64 s[2:3], -1, 0
3506+
; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0
3507+
; GFX9-W64-NEXT: s_cselect_b64 s[0:1], -1, 0
3508+
; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
3509+
; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
3510+
; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
3511+
; GFX9-W64-NEXT: exp mrt0 off, off, off, off
3512+
; GFX9-W64-NEXT: s_endpgm
3513+
;
3514+
; GFX10-W32-LABEL: wqm_init_exec_wwm:
3515+
; GFX10-W32: ; %bb.0:
3516+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3517+
; GFX10-W32-NEXT: s_mov_b32 s1, 0
3518+
; GFX10-W32-NEXT: s_cmp_lg_u64 exec, 0
3519+
; GFX10-W32-NEXT: s_mov_b32 s0, s1
3520+
; GFX10-W32-NEXT: s_cselect_b32 s2, -1, 0
3521+
; GFX10-W32-NEXT: s_cmp_lg_u64 s[0:1], 0
3522+
; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
3523+
; GFX10-W32-NEXT: s_cselect_b32 s0, -1, 0
3524+
; GFX10-W32-NEXT: s_xor_b32 s0, s2, s0
3525+
; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
3526+
; GFX10-W32-NEXT: exp mrt0 off, off, off, off
3527+
; GFX10-W32-NEXT: s_endpgm
3528+
call void @llvm.amdgcn.init.exec(i64 0)
3529+
%i = call i64 @llvm.amdgcn.ballot.i64(i1 true)
3530+
%i1 = call i32 @llvm.amdgcn.wwm.i32(i32 0)
3531+
%i2 = insertelement <2 x i32> zeroinitializer, i32 %i1, i64 0
3532+
%i3 = bitcast <2 x i32> %i2 to i64
3533+
%i4 = icmp ne i64 %i, 0
3534+
%i5 = icmp ne i64 %i3, 0
3535+
%i6 = xor i1 %i4, %i5
3536+
%i7 = uitofp i1 %i6 to float
3537+
call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %i7, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
3538+
ret void
3539+
}
3540+
34663541
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
34673542
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
34683543

0 commit comments

Comments
 (0)