diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index e388efe73cddb..b98c3799332a1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3390,6 +3390,9 @@ let SubtargetPredicate = isGFX9Plus in { let True16Predicate = NotHasTrue16BitInsts in def : PackB32Pat; +let True16Predicate = UseRealTrue16Insts in + def : PackB32Pat; + let True16Predicate = UseFakeTrue16Insts in def : PackB32Pat; } // End SubtargetPredicate = isGFX9Plus diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 84a3a3e88d238..32d8aa18d9713 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -160,14 +160,9 @@ define amdgpu_kernel void @ceil_v2f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l -; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 9909cfd32b11f..f6a9fadb33865 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -161,14 +161,9 @@ define amdgpu_kernel void @floor_v2f16( ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l -; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll index 53c26cadbf75a..ff1c3da1d5fe5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll @@ -480,9 +480,8 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32: @@ -610,9 +609,7 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16: @@ -737,15 +734,13 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32: @@ -891,12 +886,9 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16: @@ -1036,24 +1028,21 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) { ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000 -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff ; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v6.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32: @@ -1238,20 +1227,14 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) { ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v6.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 -; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16: diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 245df6684384c..94b22b79f6632 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -237,14 +237,9 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -338,17 +333,13 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v2, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index bc1b102d33de1..2a2fd93bc2d0b 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -237,14 +237,9 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; @@ -338,17 +333,13 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v2, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 2eba67b06bae1..072151dd6f5a0 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-FAKE16 %s +; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GCN-REAL16 %s +; RUN: llc -global-isel -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-REAL16 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 @@ -38,6 +42,89 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32_v2f16: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32_v2f16: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -87,6 +174,89 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_subrev_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_subrev_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32_v2f16_sub: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_subrev_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -136,6 +306,78 @@ define amdgpu_kernel void @fptrunc( ; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 ; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: fptrunc: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-GCN-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-GCN-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-GCN-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GCN-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: fptrunc: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s6, -1 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s10, s6 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s11, s7 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s8, s2 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s9, s3 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s4, s0 +; GFX11-GCN-REAL16-NEXT: buffer_load_b64 v[1:2], off, s[8:11], 0 +; GFX11-GCN-REAL16-NEXT: s_mov_b32 s5, s1 +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-GCN-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-GCN-REAL16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: fptrunc: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 +; GFX11-GISEL-REAL16-NEXT: v_cvt_f16_f32_e32 v0.h, s3 +; GFX11-GISEL-REAL16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-REAL16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-REAL16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-REAL16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { %a.val = load <2 x float>, ptr addrspace(1) %a @@ -178,6 +420,89 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace( ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32.fabs: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, |v1|, |v0| +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32.fabs: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, |v1|, |v0| +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32.fabs: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h| +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fabs: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, |v0.l|, |v0.h| +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext @@ -229,6 +554,89 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace( ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND ; GISEL-NEXT: s_endpgm +; +; GFX11-GCN-FAKE16-LABEL: v_pack_b32.fneg: +; GFX11-GCN-FAKE16: ; %bb.0: +; GFX11-GCN-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GCN-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GCN-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-FAKE16-NEXT: v_pack_b32_f16 v0, -v1, -v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GCN-FAKE16-NEXT: ; use v0 +; GFX11-GCN-FAKE16-NEXT: ;;#ASMEND +; GFX11-GCN-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: v_pack_b32.fneg: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3] glc dlc +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, 2.0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, -v1, -v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMSTART +; GFX11-GISEL-FAKE16-NEXT: ; use v0 +; GFX11-GISEL-FAKE16-NEXT: ;;#ASMEND +; GFX11-GISEL-FAKE16-NEXT: s_endpgm +; +; GFX11-GCN-REAL16-LABEL: v_pack_b32.fneg: +; GFX11-GCN-REAL16: ; %bb.0: +; GFX11-GCN-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GCN-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GCN-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GCN-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-GCN-REAL16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l +; GFX11-GCN-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v0.h +; GFX11-GCN-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GCN-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h +; GFX11-GCN-REAL16-NEXT: ;;#ASMSTART +; GFX11-GCN-REAL16-NEXT: ; use v0 +; GFX11-GCN-REAL16-NEXT: ;;#ASMEND +; GFX11-GCN-REAL16-NEXT: s_endpgm +; +; GFX11-GISEL-REAL16-LABEL: v_pack_b32.fneg: +; GFX11-GISEL-REAL16: ; %bb.0: +; GFX11-GISEL-REAL16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-REAL16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-REAL16-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.l, 2.0, v1.l +; GFX11-GISEL-REAL16-NEXT: v_add_f16_e32 v0.h, 2.0, v2.l +; GFX11-GISEL-REAL16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-REAL16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h +; GFX11-GISEL-REAL16-NEXT: ;;#ASMSTART +; GFX11-GISEL-REAL16-NEXT: ; use v0 +; GFX11-GISEL-REAL16-NEXT: ;;#ASMEND +; GFX11-GISEL-REAL16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds half, ptr addrspace(1) %in0, i64 %tid.ext