Skip to content

Commit a6dabed

Browse files
authored
[AMDGPU] Fix nondeterminism in SIFixSGPRCopies (#70644)
There are a couple of loops that iterate over V2SCopies. The iteration order needs to be deterministic, otherwise we can call moveToVALU in different orders, which causes temporary vregs to be allocated in different orders, which can affect register allocation heuristics.
1 parent deb5bd1 commit a6dabed

6 files changed

+72
-18
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ class SIFixSGPRCopies : public MachineFunctionPass {
125125
SmallVector<MachineInstr*, 4> PHINodes;
126126
SmallVector<MachineInstr*, 4> S2VCopies;
127127
unsigned NextVGPRToSGPRCopyID;
128-
DenseMap<unsigned, V2SCopyInfo> V2SCopies;
128+
MapVector<unsigned, V2SCopyInfo> V2SCopies;
129129
DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
130130

131131
public:
@@ -988,7 +988,7 @@ bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
988988
for (auto J : Info->Siblings) {
989989
auto InfoIt = V2SCopies.find(J);
990990
if (InfoIt != V2SCopies.end()) {
991-
MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
991+
MachineInstr *SiblingCopy = InfoIt->second.Copy;
992992
if (SiblingCopy->isImplicitDef())
993993
// the COPY has already been MoveToVALUed
994994
continue;
@@ -1023,12 +1023,12 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10231023
unsigned CurID = LoweringWorklist.pop_back_val();
10241024
auto CurInfoIt = V2SCopies.find(CurID);
10251025
if (CurInfoIt != V2SCopies.end()) {
1026-
V2SCopyInfo C = CurInfoIt->getSecond();
1026+
V2SCopyInfo C = CurInfoIt->second;
10271027
LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
10281028
for (auto S : C.Siblings) {
10291029
auto SibInfoIt = V2SCopies.find(S);
10301030
if (SibInfoIt != V2SCopies.end()) {
1031-
V2SCopyInfo &SI = SibInfoIt->getSecond();
1031+
V2SCopyInfo &SI = SibInfoIt->second;
10321032
LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
10331033
if (!SI.NeedToBeConvertedToVALU) {
10341034
SI.SChain.set_subtract(C.SChain);
@@ -1040,6 +1040,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
10401040
}
10411041
LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
10421042
<< " is being turned to VALU\n");
1043+
// TODO: MapVector::erase is inefficient. Do bulk removal with remove_if
1044+
// instead.
10431045
V2SCopies.erase(C.ID);
10441046
Copies.insert(C.Copy);
10451047
}

llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -948,12 +948,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
948948
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
949949
; GFX90A-NEXT: {{ $}}
950950
; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec
951-
; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec
952-
; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr38, $vgpr20, implicit $exec
951+
; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec
952+
; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr20, implicit $exec
953953
; GFX90A-NEXT: renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec
954-
; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec
955-
; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr36, $vgpr10, implicit $exec
956-
; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr48, $vgpr12, implicit $exec
954+
; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr18, implicit $exec
955+
; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr10, implicit $exec
956+
; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr12, implicit $exec
957957
; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec
958958
; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1
959959
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc
@@ -975,20 +975,20 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
975975
; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec
976976
; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec
977977
; GFX90A-NEXT: renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec
978-
; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec
978+
; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec
979979
; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec
980+
; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec
980981
; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr27, implicit $exec
981-
; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec
982982
; GFX90A-NEXT: renamable $vgpr29 = COPY renamable $vgpr27, implicit $exec
983983
; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec
984984
; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec
985985
; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
986986
; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec
987-
; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
987+
; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
988988
; GFX90A-NEXT: renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec
989989
; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3)
990-
; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
991-
; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr48_vgpr49, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
990+
; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
991+
; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr10, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
992992
; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)
993993
; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr10, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3)
994994
; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3)

llvm/test/CodeGen/AMDGPU/carryout-selection.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1965,7 +1965,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
19651965
; VI-NEXT: v_mul_lo_u32 v2, s8, v4
19661966
; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
19671967
; VI-NEXT: v_mul_lo_u32 v3, s9, v5
1968-
; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
1968+
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2
19691969
; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3
19701970
; VI-NEXT: v_mul_hi_u32 v6, v5, v0
19711971
; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
3+
4+
define amdgpu_gs void @f(i32 inreg %arg, i32 %arg1, i32 %arg2) {
5+
; CHECK-LABEL: f:
6+
; CHECK: ; %bb.0: ; %bb
7+
; CHECK-NEXT: s_cmp_eq_u32 s0, 0
8+
; CHECK-NEXT: s_mov_b32 s0, 0
9+
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
10+
; CHECK-NEXT: ; %bb.1: ; %bb3
11+
; CHECK-NEXT: v_mov_b32_e32 v5, v0
12+
; CHECK-NEXT: s_branch .LBB0_3
13+
; CHECK-NEXT: .LBB0_2:
14+
; CHECK-NEXT: v_mov_b32_e32 v5, 1
15+
; CHECK-NEXT: v_mov_b32_e32 v1, 0
16+
; CHECK-NEXT: .LBB0_3: ; %bb4
17+
; CHECK-NEXT: v_mov_b32_e32 v6, 0
18+
; CHECK-NEXT: s_mov_b32 s1, s0
19+
; CHECK-NEXT: s_mov_b32 s2, s0
20+
; CHECK-NEXT: s_mov_b32 s3, s0
21+
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
22+
; CHECK-NEXT: v_mov_b32_e32 v7, v6
23+
; CHECK-NEXT: v_mov_b32_e32 v8, v6
24+
; CHECK-NEXT: v_mov_b32_e32 v2, v6
25+
; CHECK-NEXT: v_mov_b32_e32 v3, v6
26+
; CHECK-NEXT: v_mov_b32_e32 v4, v6
27+
; CHECK-NEXT: s_clause 0x1
28+
; CHECK-NEXT: buffer_store_b128 v[5:8], v6, s[0:3], 0 idxen
29+
; CHECK-NEXT: buffer_store_b128 v[1:4], v6, s[0:3], 0 idxen
30+
; CHECK-NEXT: s_nop 0
31+
; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
32+
; CHECK-NEXT: s_endpgm
33+
bb:
34+
%i = icmp eq i32 %arg, 0
35+
br i1 %i, label %bb4, label %bb3
36+
37+
bb3:
38+
br label %bb4
39+
40+
bb4:
41+
%i5 = phi i32 [ %arg1, %bb3 ], [ 1, %bb ]
42+
%i6 = phi i32 [ %arg2, %bb3 ], [ 0, %bb ]
43+
%i7 = insertelement <4 x i32> zeroinitializer, i32 %i5, i64 0
44+
%i8 = bitcast <4 x i32> %i7 to <4 x float>
45+
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %i8, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0)
46+
%i9 = insertelement <4 x i32> zeroinitializer, i32 %i6, i64 0
47+
%i10 = bitcast <4 x i32> %i9 to <4 x float>
48+
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %i10, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0)
49+
ret void
50+
}
51+
52+
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg)

llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
1616
; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
1717
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
1818
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
19-
; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v2, v3
19+
; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v2
2020
; NOSDWA-NEXT: flat_store_dword v[0:1], v2
2121
; NOSDWA-NEXT: s_endpgm
2222
;
@@ -30,7 +30,7 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
3030
; GFX89-NEXT: v_mov_b32_e32 v0, s0
3131
; GFX89-NEXT: v_mov_b32_e32 v1, s1
3232
; GFX89-NEXT: s_waitcnt vmcnt(0)
33-
; GFX89-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
33+
; GFX89-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
3434
; GFX89-NEXT: flat_store_dword v[0:1], v2
3535
; GFX89-NEXT: s_endpgm
3636
;

llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
509509
; SI-NEXT: bb.6.sw.bb18:
510510
; SI-NEXT: successors: %bb.5(0x80000000)
511511
; SI-NEXT: {{ $}}
512-
; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %33:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
512+
; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %35:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4
513513
; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
514514
; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `ptr addrspace(1) null`, addrspace 1)
515515
; SI-NEXT: S_BRANCH %bb.5

0 commit comments

Comments
 (0)