diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp index 1d9408d6db433..9a1978c7eef2c 100644 --- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp +++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp @@ -134,10 +134,6 @@ class FlattenCFGOpt { /// its predecessor. In Case 2, BB (BB3) only has conditional branches /// as its predecessors. bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { - PHINode *PHI = dyn_cast<PHINode>(BB->begin()); - if (PHI) - return false; // For simplicity, avoid cases containing PHI nodes. - BasicBlock *LastCondBlock = nullptr; BasicBlock *FirstCondBlock = nullptr; BasicBlock *UnCondBlock = nullptr; @@ -208,8 +204,12 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { if (Idx == -1) Idx = CIdx; - else if (CIdx != Idx) - return false; + else if (CIdx != Idx) { + // Invert the branch condition. + IRBuilder<>::InsertPointGuard Guard(Builder); + Builder.SetInsertPoint(PBI); + InvertBranch(PBI, Builder); + } // PS is the successor which is not BB. Check successors to identify // the last conditional branch. @@ -269,11 +269,6 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { if (!PBI1 || !PBI1->isUnconditional()) return false; - // PS2 should not contain PHI node. - PHI = dyn_cast<PHINode>(PS2->begin()); - if (PHI) - return false; - // Do the transformation. BasicBlock *CB; BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator()); @@ -291,17 +286,45 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) { // Merge conditions. Builder.SetInsertPoint(PBI); Value *NC; - if (Idx == 0) - // Case 2, use parallel or. - NC = Builder.CreateOr(PC, CC); - else + if (UnCondBlock) // Case 1, use parallel and. NC = Builder.CreateAnd(PC, CC); + else + // Case 2, use parallel or. 
+ NC = Builder.CreateOr(PC, CC); + + // Fixup PHI node if needed + for (BasicBlock *CBS : successors(PBI)) { + for (PHINode &Phi : CBS->phis()) { + Value *origPhi0 = nullptr; + Value *newPhi = nullptr; + if (llvm::is_contained(Phi.blocks(), FirstCondBlock)) { + origPhi0 = Phi.removeIncomingValue(FirstCondBlock, false); + newPhi = origPhi0; + } + if (llvm::is_contained(Phi.blocks(), CB)) { + Value *origPhi1 = Phi.removeIncomingValue(CB, false); + newPhi = origPhi1; + + if (origPhi0) { + // Swap branch given the conditions + if (PBI->getSuccessor(0) == CBS) { + newPhi = Builder.CreateSelect(PC, origPhi0, origPhi1); + } else { + newPhi = Builder.CreateSelect(PC, origPhi1, origPhi0); + } + } + } + if (newPhi) + Phi.addIncoming(newPhi, FirstCondBlock); + } + } PBI->replaceUsesOfWith(CC, NC); PC = NC; if (CB == LastCondBlock) Iteration = false; + // Remove internal conditional branches. CB->dropAllReferences(); // make CB unreachable and let downstream to delete the block. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll index 9c2fabce4bcde..7dcf3dd620030 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll @@ -480,29 +480,58 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX906-LABEL: v8i8_phi_chain: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX906-NEXT: v_lshlrev_b32_e32 v3, 3, v0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 -; GFX906-NEXT: s_xor_b64 s[0:1], vcc, -1 +; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX906-NEXT: v_cmp_le_u32_e32 vcc, 15, v0 +; GFX906-NEXT: v_cmp_gt_u32_e64 s[0:1], 7, v0 +; GFX906-NEXT: s_or_b64 s[2:3], vcc, s[0:1] ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[8:9] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, 
s[8:9] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[10:11] +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] ; GFX906-NEXT: s_cbranch_execz .LBB8_2 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v3, s[10:11] -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX906-NEXT: s_and_b64 s[4:5], exec, vcc -; GFX906-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX906-NEXT: .LBB8_2: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX906-NEXT: s_cbranch_execz .LBB8_4 -; GFX906-NEXT: ; %bb.3: ; %bb.2 -; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: ; %bb.1: ; %bb.2 +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 24, v4 ; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v13, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v15, 24, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX906-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX906-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX906-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc +; GFX906-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX906-NEXT: v_cndmask_b32_e32 v4, v13, v7, vcc +; GFX906-NEXT: v_cndmask_b32_e32 v7, v15, v9, vcc +; GFX906-NEXT: v_mov_b32_e32 v9, 8 +; GFX906-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX906-NEXT: v_cndmask_b32_e32 v3, v11, v5, vcc +; GFX906-NEXT: v_cndmask_b32_e32 v5, v12, v6, vcc +; GFX906-NEXT: v_cndmask_b32_e32 v6, v14, v8, vcc +; GFX906-NEXT: v_mov_b32_e32 v8, 0xff +; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX906-NEXT: v_and_or_b32 v0, v1, v8, v0 +; GFX906-NEXT: v_and_b32_e32 v1, 0xff, v3 +; 
GFX906-NEXT: v_and_b32_e32 v3, 0xff, v5 +; GFX906-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX906-NEXT: v_or3_b32 v1, v0, v1, v3 +; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX906-NEXT: v_and_or_b32 v0, v2, v8, v0 +; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v6 +; GFX906-NEXT: v_and_b32_e32 v3, 0xff, v7 +; GFX906-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX906-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX906-NEXT: v_or3_b32 v2, v0, v2, v3 +; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[12:13] -; GFX906-NEXT: .LBB8_4: ; %bb.3 -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX906-NEXT: .LBB8_2: ; %bb.3 +; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] @@ -535,29 +564,50 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX906-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0 +; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 +; GFX906-NEXT: v_cmp_gt_u32_e64 s[0:1], 15, v0 +; GFX906-NEXT: s_and_b64 s[2:3], s[0:1], vcc ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[8:9] +; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, s[8:9] +; GFX906-NEXT: global_load_dwordx2 v[3:4], v5, s[10:11] +; GFX906-NEXT: s_mov_b64 vcc, s[0:1] +; GFX906-NEXT: v_mov_b32_e32 v6, 8 +; GFX906-NEXT: v_mov_b32_e32 v5, 0xff +; GFX906-NEXT: s_waitcnt vmcnt(1) +; GFX906-NEXT: v_lshrrev_b32_e32 v7, 8, v1 ; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: v_mov_b32_e32 v1, v3 -; GFX906-NEXT: v_mov_b32_e32 v2, v4 -; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX906-NEXT: s_cbranch_execz .LBB9_4 -; GFX906-NEXT: ; %bb.1: ; %bb.1 -; GFX906-NEXT: global_load_dwordx2 v[1:2], v5, 
s[10:11] -; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 7, v0 -; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX906-NEXT: s_cbranch_execz .LBB9_3 -; GFX906-NEXT: ; %bb.2: ; %bb.2 +; GFX906-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX906-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX906-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX906-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[0:1] +; GFX906-NEXT: v_cndmask_b32_sdwa v9, v1, v3, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX906-NEXT: v_cndmask_b32_e64 v0, v1, v3, s[0:1] +; GFX906-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[0:1] +; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX906-NEXT: v_and_b32_e32 v9, 0xff, v9 +; GFX906-NEXT: v_cndmask_b32_e64 v11, v2, v4, s[0:1] +; GFX906-NEXT: v_cndmask_b32_sdwa v10, v1, v3, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX906-NEXT: v_lshlrev_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX906-NEXT: v_and_or_b32 v0, v0, v5, v7 +; GFX906-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GFX906-NEXT: v_cndmask_b32_sdwa v8, v2, v4, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX906-NEXT: v_and_or_b32 v6, v11, v5, v6 +; GFX906-NEXT: v_or3_b32 v5, v0, v7, v10 +; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v8 +; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX906-NEXT: v_cndmask_b32_sdwa v7, v2, v4, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 +; GFX906-NEXT: v_or3_b32 v6, v6, v0, v7 +; GFX906-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] +; GFX906-NEXT: s_cbranch_execz .LBB9_2 +; GFX906-NEXT: ; %bb.1: ; %bb.2 +; GFX906-NEXT: v_mov_b32_e32 v6, v4 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: global_store_dwordx2 v0, v[3:4], s[12:13] -; GFX906-NEXT: .LBB9_3: ; %Flow -; GFX906-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX906-NEXT: .LBB9_4: ; %bb.3 +; GFX906-NEXT: v_mov_b32_e32 v5, v3 +; GFX906-NEXT: 
global_store_dwordx2 v0, v[1:2], s[12:13] +; GFX906-NEXT: .LBB9_2: ; %bb.3 ; GFX906-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: global_store_dwordx2 v0, v[1:2], s[14:15] +; GFX906-NEXT: global_store_dwordx2 v0, v[5:6], s[14:15] ; GFX906-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll index ad0d6d8016ad6..19f3705c97d4b 100644 --- a/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -12,73 +12,59 @@ define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %ext ; CHECK-NEXT: s_load_dwordx8 s[48:55], s[8:9], 0x0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b32 s12, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s52, 0 -; CHECK-NEXT: s_cbranch_scc1 .LBB0_9 -; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i -; CHECK-NEXT: s_cmp_eq_u32 s54, 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_4 -; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i -; CHECK-NEXT: s_cmp_lg_u32 s55, 0 -; CHECK-NEXT: s_mov_b32 s17, 0 -; CHECK-NEXT: s_cselect_b32 s12, -1, 0 -; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s12 -; CHECK-NEXT: s_cbranch_vccz .LBB0_5 -; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_mov_b32 s18, 0 -; CHECK-NEXT: s_branch .LBB0_6 -; CHECK-NEXT: .LBB0_4: -; CHECK-NEXT: s_mov_b32 s14, s12 -; CHECK-NEXT: s_mov_b32 s15, s12 -; CHECK-NEXT: s_mov_b32 s13, s12 -; CHECK-NEXT: s_mov_b64 s[50:51], s[14:15] -; CHECK-NEXT: s_mov_b64 s[48:49], s[12:13] -; CHECK-NEXT: s_branch .LBB0_8 -; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i +; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i ; CHECK-NEXT: v_cmp_lt_f32_e64 s12, s53, 0 -; CHECK-NEXT: s_mov_b32 s18, 1.0 -; CHECK-NEXT: s_mov_b32 s17, 0x7fc00000 -; CHECK-NEXT: 
.LBB0_6: ; %Flow -; CHECK-NEXT: s_mov_b32 s48, 1.0 -; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12 +; CHECK-NEXT: s_cmp_lg_u32 s55, 0 +; CHECK-NEXT: s_cselect_b32 s17, -1, 0 +; CHECK-NEXT: s_or_b32 s12, s17, s12 +; CHECK-NEXT: s_cmp_lg_u32 s54, 0 +; CHECK-NEXT: s_cselect_b32 s13, -1, 0 +; CHECK-NEXT: s_and_b32 s18, s13, exec_lo +; CHECK-NEXT: s_cselect_b32 s48, 1.0, 0 +; CHECK-NEXT: s_and_b32 s12, s13, s12 ; CHECK-NEXT: s_mov_b32 s49, s48 ; CHECK-NEXT: s_mov_b32 s50, s48 +; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s12 ; CHECK-NEXT: s_mov_b32 s51, s48 -; CHECK-NEXT: s_cbranch_vccnz .LBB0_8 -; CHECK-NEXT: ; %bb.7: ; %if.end273.i.i +; CHECK-NEXT: s_cbranch_vccnz .LBB0_3 +; CHECK-NEXT: ; %bb.2: ; %if.end273.i.i ; CHECK-NEXT: s_add_u32 s12, s8, 40 ; CHECK-NEXT: s_addc_u32 s13, s9, 0 -; CHECK-NEXT: s_getpc_b64 s[20:21] -; CHECK-NEXT: s_add_u32 s20, s20, _Z3dotDv3_fS_@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s21, s21, _Z3dotDv3_fS_@gotpcrel32@hi+12 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, _Z3dotDv3_fS_@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, _Z3dotDv3_fS_@gotpcrel32@hi+12 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 1.0, 0, s17 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0x7fc00000, 0, s17 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 -; CHECK-NEXT: s_load_dwordx2 s[20:21], s[20:21], 0x0 -; CHECK-NEXT: v_lshlrev_b32_e32 v3, 10, v1 -; CHECK-NEXT: v_add_f32_e64 v1, s17, s18 +; CHECK-NEXT: v_lshlrev_b32_e32 v5, 10, v1 ; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] ; CHECK-NEXT: s_mov_b64 s[8:9], s[12:13] +; CHECK-NEXT: v_add_f32_e32 v1, v4, v3 ; CHECK-NEXT: s_mov_b32 s12, s14 -; CHECK-NEXT: v_or3_b32 v31, v0, v3, v2 -; CHECK-NEXT: v_mov_b32_e32 v0, v1 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_or3_b32 v31, v0, v5, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_mov_b32 s14, s16 ; 
CHECK-NEXT: s_mov_b32 s48, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_swappc_b64 s[30:31], s[20:21] +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] ; CHECK-NEXT: s_mov_b64 s[8:9], s[34:35] ; CHECK-NEXT: s_mov_b32 s49, s48 ; CHECK-NEXT: s_mov_b32 s50, s48 ; CHECK-NEXT: s_mov_b32 s51, s48 -; CHECK-NEXT: .LBB0_8: ; %if.end294.i.i +; CHECK-NEXT: .LBB0_3: ; %if.end294.i.i ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; CHECK-NEXT: .LBB0_9: ; %kernel_direct_lighting.exit +; CHECK-NEXT: .LBB0_4: ; %kernel_direct_lighting.exit ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x20 ; CHECK-NEXT: v_mov_b32_e32 v0, s48 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index 827cb4ac2589a..98aa1f0849eec 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -20,50 +20,48 @@ define amdgpu_ps void @main(i32 %0, float %1) { ; ISA: ; %bb.0: ; %start ; ISA-NEXT: v_readfirstlane_b32 s0, v0 ; ISA-NEXT: s_mov_b32 m0, s0 -; ISA-NEXT: s_mov_b32 s10, 0 +; ISA-NEXT: s_mov_b64 s[4:5], 0 ; ISA-NEXT: v_interp_p1_f32_e32 v0, v1, attr0.x ; ISA-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; ISA-NEXT: s_mov_b64 s[0:1], 0 -; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 -; ISA-NEXT: ; implicit-def: $sgpr2_sgpr3 -; ISA-NEXT: s_branch .LBB0_3 -; ISA-NEXT: .LBB0_1: ; %Flow1 -; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_or_b64 exec, exec, s[4:5] -; ISA-NEXT: s_mov_b64 s[8:9], 0 -; ISA-NEXT: s_mov_b64 s[4:5], s[6:7] -; ISA-NEXT: .LBB0_2: ; %Flow -; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_and_b64 s[6:7], exec, s[4:5] -; ISA-NEXT: s_or_b64 s[0:1], 
s[6:7], s[0:1] -; ISA-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; ISA-NEXT: s_and_b64 s[6:7], s[8:9], exec -; ISA-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; ISA-NEXT: s_andn2_b64 exec, exec, s[0:1] -; ISA-NEXT: s_cbranch_execz .LBB0_6 -; ISA-NEXT: .LBB0_3: ; %loop +; ISA-NEXT: v_mov_b32_e32 v1, 0 +; ISA-NEXT: ; implicit-def: $sgpr10_sgpr11 +; ISA-NEXT: ; implicit-def: $sgpr8_sgpr9 +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: s_branch .LBB0_2 +; ISA-NEXT: .LBB0_1: ; %Flow +; ISA-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[0:1] +; ISA-NEXT: s_and_b64 s[0:1], exec, s[8:9] +; ISA-NEXT: s_or_b64 s[4:5], s[0:1], s[4:5] +; ISA-NEXT: s_andn2_b64 s[0:1], s[6:7], exec +; ISA-NEXT: s_and_b64 s[2:3], s[10:11], exec +; ISA-NEXT: s_or_b64 s[6:7], s[0:1], s[2:3] +; ISA-NEXT: s_andn2_b64 exec, exec, s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB0_4 +; ISA-NEXT: .LBB0_2: ; %loop ; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 -; ISA-NEXT: s_or_b64 s[4:5], s[4:5], exec -; ISA-NEXT: s_mov_b64 s[6:7], -1 -; ISA-NEXT: s_cmp_lt_u32 s10, 32 -; ISA-NEXT: s_mov_b64 s[8:9], -1 -; ISA-NEXT: s_cbranch_scc0 .LBB0_2 -; ISA-NEXT: ; %bb.4: ; %endif1 -; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_and_saveexec_b64 s[4:5], vcc +; ISA-NEXT: v_cmp_lt_u32_e64 s[0:1], 31, v1 +; ISA-NEXT: v_cmp_gt_u32_e64 s[2:3], 32, v1 +; ISA-NEXT: s_andn2_b64 s[10:11], s[10:11], exec +; ISA-NEXT: s_and_b64 s[0:1], s[0:1], exec +; ISA-NEXT: s_and_b64 s[2:3], s[2:3], vcc +; ISA-NEXT: s_or_b64 s[8:9], s[8:9], exec +; ISA-NEXT: s_or_b64 s[10:11], s[10:11], s[0:1] +; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] ; ISA-NEXT: s_cbranch_execz .LBB0_1 -; ISA-NEXT: ; %bb.5: ; %endif2 -; ISA-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; ISA-NEXT: s_add_i32 s10, s10, 1 -; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: ; %bb.3: ; %endif2 +; ISA-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; ISA-NEXT: v_add_u32_e32 v1, 1, v1 +; ISA-NEXT: s_andn2_b64 s[8:9], s[8:9], exec +; ISA-NEXT: s_andn2_b64 
s[10:11], s[10:11], exec ; ISA-NEXT: s_branch .LBB0_1 -; ISA-NEXT: .LBB0_6: ; %Flow2 -; ISA-NEXT: s_or_b64 exec, exec, s[0:1] +; ISA-NEXT: .LBB0_4: ; %Flow2 +; ISA-NEXT: s_or_b64 exec, exec, s[4:5] ; ISA-NEXT: v_mov_b32_e32 v1, 0 -; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] -; ISA-NEXT: ; %bb.7: ; %if1 +; ISA-NEXT: s_and_saveexec_b64 s[0:1], s[6:7] +; ISA-NEXT: ; %bb.5: ; %if1 ; ISA-NEXT: v_sqrt_f32_e32 v1, v0 -; ISA-NEXT: ; %bb.8: ; %endloop +; ISA-NEXT: ; %bb.6: ; %endloop ; ISA-NEXT: s_or_b64 exec, exec, s[0:1] ; ISA-NEXT: exp mrt0 v1, v1, v1, v1 done vm ; ISA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/jump-address.ll b/llvm/test/CodeGen/AMDGPU/jump-address.ll index d58db378e1384..557536aa45483 100644 --- a/llvm/test/CodeGen/AMDGPU/jump-address.ll +++ b/llvm/test/CodeGen/AMDGPU/jump-address.ll @@ -1,6 +1,6 @@ ;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck %s -; CHECK: JUMP @6 +; CHECK: JUMP @3 ; CHECK: EXPORT ; CHECK-NOT: EXPORT diff --git a/llvm/test/CodeGen/AMDGPU/predicates.ll b/llvm/test/CodeGen/AMDGPU/predicates.ll index 6a23875c18241..c5ef622d3aaf8 100644 --- a/llvm/test/CodeGen/AMDGPU/predicates.ll +++ b/llvm/test/CodeGen/AMDGPU/predicates.ll @@ -45,10 +45,8 @@ ENDIF: } ; CHECK-LABEL: {{^}}nested_if: -; CHECK: ALU_PUSH_BEFORE -; CHECK: JUMP -; CHECK: POP -; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec +; CHECK: ALU +; CHECK: CNDGT_INT ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel define amdgpu_kernel void @nested_if(ptr addrspace(1) %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll index a401f989a2507..b0ceae03b013c 100644 --- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll +++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll @@ -391,30 +391,25 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-LABEL: v8i8_phi_chain: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v2, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v2 -; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v2 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v2 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX942-NEXT: v_cmp_lt_u32_e32 vcc, 14, v4 +; GFX942-NEXT: v_cmp_gt_u32_e64 s[0:1], 7, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] -; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[10:11] +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] ; GFX942-NEXT: s_cbranch_execz .LBB8_2 -; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v2 -; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: .LBB8_2: ; %Flow -; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX942-NEXT: s_cbranch_execz .LBB8_4 -; GFX942-NEXT: ; %bb.3: ; %bb.2 -; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: ; %bb.1: ; %bb.2 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13] -; GFX942-NEXT: .LBB8_4: ; %bb.3 -; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: .LBB8_2: ; %bb.3 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(1) ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] @@ -449,36 +444,26 @@ define amdgpu_kernel void @v8i8_phi_zeroinit(ptr addrspace(1) %src1, ptr addrspa ; GFX942-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x24 ; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4 -; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v4 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v4 +; GFX942-NEXT: v_cmp_lt_u32_e32 vcc, 14, v4 +; GFX942-NEXT: v_cmp_gt_u32_e64 s[0:1], 7, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9] -; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[10:11] +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] ; GFX942-NEXT: s_cbranch_execz .LBB9_2 -; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4 -; GFX942-NEXT: s_waitcnt vmcnt(1) -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX942-NEXT: v_mov_b32_e32 v1, v0 -; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX942-NEXT: .LBB9_2: ; %Flow -; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX942-NEXT: s_cbranch_execz .LBB9_4 -; GFX942-NEXT: ; %bb.3: ; %bb.2 +; GFX942-NEXT: ; %bb.1: ; %bb.2 ; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX942-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc ; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13] -; GFX942-NEXT: v_mov_b64_e32 v[2:3], v[0:1] -; GFX942-NEXT: .LBB9_4: ; %bb.3 -; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX942-NEXT: .LBB9_2: ; %bb.3 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(1) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[14:15] +; GFX942-NEXT: 
global_store_dwordx2 v2, v[0:1], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -507,88 +492,30 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace( ; GFX942-LABEL: v8i8_phi_const: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v16, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v3, 3, v16 -; GFX942-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v16 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v16 +; GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX942-NEXT: v_cmp_lt_u32_e32 vcc, 14, v4 +; GFX942-NEXT: v_cmp_gt_u32_e64 s[0:1], 7, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[0:1], v3, s[8:9] -; GFX942-NEXT: ; implicit-def: $vgpr2 -; GFX942-NEXT: ; implicit-def: $vgpr12 -; GFX942-NEXT: ; implicit-def: $vgpr10 -; GFX942-NEXT: ; implicit-def: $vgpr13 -; GFX942-NEXT: ; implicit-def: $vgpr14 -; GFX942-NEXT: ; implicit-def: $vgpr11 -; GFX942-NEXT: ; implicit-def: $vgpr15 -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX942-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX942-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX942-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX942-NEXT: v_lshrrev_b32_e32 v8, 16, v0 -; GFX942-NEXT: v_lshrrev_b32_e32 v9, 8, v0 -; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[10:11] +; GFX942-NEXT: s_or_b64 s[2:3], vcc, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] ; GFX942-NEXT: s_cbranch_execz .LBB10_2 -; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[2:3], v3, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v16 -; GFX942-NEXT: s_andn2_b64 s[0:1], s[0:1], exec -; GFX942-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX942-NEXT: v_mov_b32_e32 v4, 8 -; GFX942-NEXT: v_mov_b32_e32 v5, 7 -; GFX942-NEXT: v_mov_b32_e32 v6, 
6 -; GFX942-NEXT: v_mov_b32_e32 v1, 5 -; GFX942-NEXT: v_mov_b32_e32 v7, 4 -; GFX942-NEXT: v_mov_b32_e32 v8, 3 -; GFX942-NEXT: v_mov_b32_e32 v9, 2 -; GFX942-NEXT: v_mov_b32_e32 v0, 1 -; GFX942-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; GFX942-NEXT: ; %bb.1: ; %bb.2 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v11, 16, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v14, 8, v3 -; GFX942-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX942-NEXT: v_lshrrev_b32_e32 v12, 8, v2 -; GFX942-NEXT: .LBB10_2: ; %Flow -; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GFX942-NEXT: s_cbranch_execz .LBB10_4 -; GFX942-NEXT: ; %bb.3: ; %bb.2 -; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v9 -; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v7 -; GFX942-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v11, 8, v4 -; GFX942-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX942-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v11, v5, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_mov_b32_e32 v10, 0 -; GFX942-NEXT: v_or_b32_sdwa v3, v3, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: global_store_dwordx2 v10, v[2:3], s[12:13] -; GFX942-NEXT: v_mov_b32_e32 v2, v0 -; GFX942-NEXT: v_mov_b32_e32 v12, v9 -; GFX942-NEXT: v_mov_b32_e32 v10, v8 -; GFX942-NEXT: v_mov_b32_e32 v13, v7 -; GFX942-NEXT: v_mov_b32_e32 v3, v1 -; GFX942-NEXT: v_mov_b32_e32 v14, v6 -; GFX942-NEXT: v_mov_b32_e32 v11, v5 -; GFX942-NEXT: v_mov_b32_e32 v15, 
v4 -; GFX942-NEXT: .LBB10_4: ; %bb.3 -; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: v_lshlrev_b16_e32 v0, 8, v12 -; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v13 -; GFX942-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v2, 8, v15 -; GFX942-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: v_lshlrev_b16_e32 v1, 8, v14 -; GFX942-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15] +; GFX942-NEXT: v_mov_b32_e32 v0, 0x8070605 +; GFX942-NEXT: v_cndmask_b32_e32 v1, v0, v3, vcc +; GFX942-NEXT: v_mov_b32_e32 v0, 0x4030201 +; GFX942-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[12:13] +; GFX942-NEXT: .LBB10_2: ; %bb.3 +; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX942-NEXT: s_waitcnt vmcnt(1) +; GFX942-NEXT: v_mov_b32_e32 v2, 0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() @@ -617,30 +544,27 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac ; GFX942-LABEL: v8i8_multi_block: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX942-NEXT: v_and_b32_e32 v5, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v6, 3, v5 -; GFX942-NEXT: v_mov_b32_e32 v4, 0 -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 15, v5 +; 
GFX942-NEXT: v_and_b32_e32 v4, 0x3ff, v0 +; GFX942-NEXT: v_lshlrev_b32_e32 v5, 3, v4 +; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v4 +; GFX942-NEXT: v_cmp_gt_u32_e64 s[0:1], 15, v4 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9] +; GFX942-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11] +; GFX942-NEXT: s_and_b64 s[2:3], s[0:1], vcc ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[0:1], v[2:3] -; GFX942-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX942-NEXT: s_cbranch_execz .LBB11_4 -; GFX942-NEXT: ; %bb.1: ; %bb.1 -; GFX942-NEXT: global_load_dwordx2 v[0:1], v6, s[10:11] -; GFX942-NEXT: v_cmp_gt_u32_e32 vcc, 7, v5 -; GFX942-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX942-NEXT: s_cbranch_execz .LBB11_3 -; GFX942-NEXT: ; %bb.2: ; %bb.2 -; GFX942-NEXT: v_mov_b32_e32 v5, 0 -; GFX942-NEXT: global_store_dwordx2 v5, v[2:3], s[12:13] -; GFX942-NEXT: .LBB11_3: ; %Flow -; GFX942-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX942-NEXT: .LBB11_4: ; %bb.3 +; GFX942-NEXT: v_cndmask_b32_e64 v5, v1, v3, s[0:1] +; GFX942-NEXT: v_cndmask_b32_e64 v4, v0, v2, s[0:1] +; GFX942-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] +; GFX942-NEXT: s_cbranch_execz .LBB11_2 +; GFX942-NEXT: ; %bb.1: ; %bb.2 +; GFX942-NEXT: v_mov_b32_e32 v4, 0 +; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] +; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[2:3] +; GFX942-NEXT: .LBB11_2: ; %bb.3 ; GFX942-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: global_store_dwordx2 v4, v[0:1], s[14:15] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: global_store_dwordx2 v0, v[4:5], s[14:15] ; GFX942-NEXT: s_endpgm entry: %idx = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll new file mode 100644 index 0000000000000..7f2d12eba5b73 --- /dev/null +++ 
b/llvm/test/Transforms/SimplifyCFG/flatten-cfg-with-phi.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --tool ./opt --version 5 +; RUN: opt < %s -passes=flatten-cfg -S | FileCheck %s + +define i1 @_Z7compareRK1SS1_(ptr %a, ptr %b) { +; CHECK-LABEL: @_Z7compareRK1SS1_( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %0 = load i32, ptr %a, align 4, !tbaa !3 +; CHECK-NEXT: %1 = load i32, ptr %b, align 4, !tbaa !3 +; CHECK-NEXT: %cmp.i = icmp sge i32 %0, %1 +; CHECK-NEXT: %cmp.i19 = icmp eq i32 %0, %1 +; CHECK-NEXT: %2 = and i1 %cmp.i, %cmp.i19 +; CHECK-NEXT: %3 = select i1 %cmp.i, i1 false, i1 true +; CHECK-NEXT: br i1 %2, label %land.rhs, label %lor.end +; CHECK-EMPTY: +; CHECK-NEXT: land.rhs: ; preds = %entry +; CHECK-NEXT: %y = getelementptr inbounds nuw i8, ptr %a, i64 4 +; CHECK-NEXT: %4 = load i32, ptr %y, align 4, !tbaa !8 +; CHECK-NEXT: %y14 = getelementptr inbounds nuw i8, ptr %b, i64 4 +; CHECK-NEXT: %5 = load i32, ptr %y14, align 4, !tbaa !8 +; CHECK-NEXT: %cmp = icmp slt i32 %4, %5 +; CHECK-NEXT: br label %lor.end +; CHECK-EMPTY: +; CHECK-NEXT: lor.end: ; preds = %land.rhs, %entry +; CHECK-NEXT: %6 = phi i1 [ %cmp, %land.rhs ], [ %3, %entry ] +; CHECK-NEXT: ret i1 %6 +entry: + %0 = load i32, ptr %a, align 4, !tbaa !3 + %1 = load i32, ptr %b, align 4, !tbaa !3 + %cmp.i = icmp slt i32 %0, %1 + br i1 %cmp.i, label %lor.end, label %lor.rhs + +lor.rhs: ; preds = %entry + %cmp.i19 = icmp eq i32 %0, %1 + br i1 %cmp.i19, label %land.rhs, label %lor.end + +land.rhs: ; preds = %lor.rhs + %y = getelementptr inbounds nuw i8, ptr %a, i64 4 + %2 = load i32, ptr %y, align 4, !tbaa !8 + %y14 = getelementptr inbounds nuw i8, ptr %b, i64 4 + %3 = load i32, ptr %y14, align 4, !tbaa !8 + %cmp = icmp slt i32 %2, %3 + br label %lor.end + +lor.end: ; preds = %lor.rhs, %land.rhs, %entry + %4 = phi i1 [ true, %entry ], [ false, %lor.rhs ], [ %cmp, %land.rhs ] + ret i1 %4 +} + 
+!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"uwtable", i32 2} +!2 = !{!"clang"} +!3 = !{!4, !5, i64 0} +!4 = !{!"_ZTS1S", !5, i64 0, !5, i64 4} +!5 = !{!"int", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C++ TBAA"} +!8 = !{!4, !5, i64 4} +!9 = !{!5, !5, i64 0}