Skip to content

Commit c6f753b

Browse files
authored
[AMDGPU][True16][MC] true16 for v_pack_b32_f16 (#119630)
Support true16 format for v_pack_b32_f16 in MC. Since we are replacing `v_pack_b32_f16` with `v_pack_b32_f16_t16`/`v_pack_b32_f16_fake16` in Post-GFX11, we have to update the CodeGen pattern for `v_pack_b32_f16_fake16` to get the CodeGen tests passing. No pattern is modified or created; `v_pack_b32_f16` is just replaced with the fake16 format. Some of the true16 CodeGen tests are impacted since `v_pack_b32_f16` selection is removed in Post-GFX11 while `v_pack_b32_f16_t16` is not yet supported. The CodeGen patch for `v_pack_b32_f16_t16` will be done in the following patch.
1 parent e6a6351 commit c6f753b

19 files changed

+440
-184
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3377,15 +3377,22 @@ def : GCNPat <
33773377

33783378
} // end foreach Ty
33793379

3380+
} // End SubtargetPredicate = HasVOP3PInsts
33803381

33813382
let AddedComplexity = 5 in {
3382-
def : GCNPat <
3383+
class PackB32Pat<Instruction inst> : GCNPat <
33833384
(v2f16 (is_canonicalized_2<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
33843385
(f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
3385-
(V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
3386+
(inst $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
33863387
>;
33873388
}
3388-
} // End SubtargetPredicate = HasVOP3PInsts
3389+
let SubtargetPredicate = isGFX9Plus in {
3390+
let True16Predicate = NotHasTrue16BitInsts in
3391+
def : PackB32Pat<V_PACK_B32_F16_e64>;
3392+
3393+
let True16Predicate = UseFakeTrue16Insts in
3394+
def : PackB32Pat<V_PACK_B32_F16_fake16_e64>;
3395+
} // End SubtargetPredicate = isGFX9Plus
33893396

33903397
// With multiple uses of the shift, this will duplicate the shift and
33913398
// increase register pressure.

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -646,7 +646,7 @@ defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32
646646
defm V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
647647
defm V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
648648

649-
defm V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
649+
defm V_PACK_B32_F16 : VOP3Inst_t16 <"v_pack_b32_f16", VOP_B32_F16_F16>;
650650

651651
let isReMaterializable = 1 in {
652652
defm V_SUB_I32 : VOP3Inst <"v_sub_i32", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
@@ -1754,7 +1754,7 @@ defm V_MIN_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30
17541754
defm V_MIN_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30c, "v_min_i16">;
17551755
defm V_ADD_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30d, "v_add_nc_i16", "V_ADD_I16">;
17561756
defm V_SUB_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30e, "v_sub_nc_i16", "V_SUB_I16">;
1757-
defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11_gfx12<0x311>;
1757+
defm V_PACK_B32_F16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x311, "v_pack_b32_f16">;
17581758
defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >;
17591759
defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >;
17601760
defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11_gfx12<0x325, "V_SUB_I32", "v_sub_nc_i32">;

llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,11 @@ define amdgpu_kernel void @ceil_v2f16(
163163
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
164164
; GFX11-NEXT: v_ceil_f16_e32 v0.h, v1.l
165165
; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
166-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
166+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
167167
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
168-
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
168+
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
169+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
170+
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
169171
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
170172
; GFX11-NEXT: s_endpgm
171173
;

llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,9 +164,11 @@ define amdgpu_kernel void @floor_v2f16(
164164
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
165165
; GFX11-NEXT: v_floor_f16_e32 v0.h, v1.l
166166
; GFX11-NEXT: v_mov_b16_e32 v1.l, v0.l
167-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
167+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
168168
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
169-
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
169+
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
170+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
171+
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1
170172
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
171173
; GFX11-NEXT: s_endpgm
172174
;

llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll

Lines changed: 22 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -480,11 +480,9 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
480480
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
481481
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
482482
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
483-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
484-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
485-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
486-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
487-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
483+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
484+
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
485+
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
488486
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
489487
;
490488
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
@@ -610,12 +608,11 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
610608
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
611609
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
612610
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v1.l
613-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
614-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
615-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
616611
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
617-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
618-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
612+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v3.l, v2.l
613+
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
614+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
615+
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
619616
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
620617
;
621618
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
@@ -744,12 +741,11 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
744741
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v5.l, v3.l
745742
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
746743
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v4, s0, 0x7fff
747-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
748-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l
749-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
750744
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
745+
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h
751746
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v2.l
752-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v3, v0
747+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
748+
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v3, v0, 0x5040100
753749
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
754750
;
755751
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
@@ -900,7 +896,7 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
900896
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
901897
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
902898
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
903-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
899+
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
904900
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
905901
;
906902
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
@@ -1043,24 +1039,21 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
10431039
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
10441040
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff
10451041
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff
1042+
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
10461043
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v2, v2, s0, 0x7fff
10471044
; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff
1048-
; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
10491045
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
10501046
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v6.l, v3.l
1051-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
1047+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l
10521048
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1049+
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v0.l, v2.l
10531050
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v1.l, v4.l
1054-
; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v7.l, v5.l
1055-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
1056-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
1057-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
1058-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l
1059-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
1060-
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
1061-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
1062-
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1063-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1
1051+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1052+
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
1053+
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
1054+
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1055+
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
1056+
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100
10641057
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
10651058
;
10661059
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
@@ -1257,8 +1250,8 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
12571250
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
12581251
; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
12591252
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1260-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v2, v0
1261-
; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v1, v3, v1
1253+
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
1254+
; GFX11-SDAG-TRUE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100
12621255
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
12631256
;
12641257
; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16:

llvm/test/CodeGen/AMDGPU/sitofp.f16.ll

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -240,9 +240,11 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
240240
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
241241
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l
242242
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
243-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
243+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
244244
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
245-
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
245+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
246+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
247+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
246248
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
247249
; GFX11-TRUE16-NEXT: s_endpgm
248250
;
@@ -344,8 +346,9 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
344346
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
345347
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
346348
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
347-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
348-
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
349+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
350+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
351+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
349352
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
350353
; GFX11-TRUE16-NEXT: s_endpgm
351354
;

llvm/test/CodeGen/AMDGPU/uitofp.f16.ll

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -240,9 +240,11 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
240240
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
241241
; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l
242242
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
243-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
243+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
244244
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
245-
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
245+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
246+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
247+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
246248
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
247249
; GFX11-TRUE16-NEXT: s_endpgm
248250
;
@@ -344,8 +346,9 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
344346
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
345347
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
346348
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
347-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
348-
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0
349+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
350+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
351+
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1
349352
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
350353
; GFX11-TRUE16-NEXT: s_endpgm
351354
;

llvm/test/MC/AMDGPU/gfx11_asm_vop3.s

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5435,11 +5435,11 @@ v_or_b16 v5.l, v255.l, v255.h
54355435
v_or_b16 v255.h, 0xfe0b, vcc_hi
54365436
// GFX11: [0xff,0x40,0x63,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
54375437

5438-
v_pack_b32_f16 v5, v1, v2
5439-
// GFX11: v_pack_b32_f16 v5, v1, v2 ; encoding: [0x05,0x00,0x11,0xd7,0x01,0x05,0x02,0x00]
5438+
v_pack_b32_f16 v5, v1.l, v2.l
5439+
// GFX11: v_pack_b32_f16 v5, v1.l, v2.l ; encoding: [0x05,0x00,0x11,0xd7,0x01,0x05,0x02,0x00]
54405440

5441-
v_pack_b32_f16 v5, v255, v255
5442-
// GFX11: v_pack_b32_f16 v5, v255, v255 ; encoding: [0x05,0x00,0x11,0xd7,0xff,0xff,0x03,0x00]
5441+
v_pack_b32_f16 v5, v255.l, v255.l
5442+
// GFX11: v_pack_b32_f16 v5, v255.l, v255.l ; encoding: [0x05,0x00,0x11,0xd7,0xff,0xff,0x03,0x00]
54435443

54445444
v_pack_b32_f16 v5, s1, s2
54455445
// GFX11: v_pack_b32_f16 v5, s1, s2 ; encoding: [0x05,0x00,0x11,0xd7,0x01,0x04,0x00,0x00]
@@ -5471,7 +5471,7 @@ v_pack_b32_f16 v5, null, exec_lo
54715471
v_pack_b32_f16 v5, -1, exec_hi
54725472
// GFX11: v_pack_b32_f16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x11,0xd7,0xc1,0xfe,0x00,0x00]
54735473

5474-
v_pack_b32_f16 v5, 0.5, -m0 op_sel:[0,0,0]
5474+
v_pack_b32_f16 v5, 0.5, -m0
54755475
// GFX11: v_pack_b32_f16 v5, 0.5, -m0 ; encoding: [0x05,0x00,0x11,0xd7,0xf0,0xfa,0x00,0x40]
54765476

54775477
v_pack_b32_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0]
@@ -5480,6 +5480,18 @@ v_pack_b32_f16 v5, -src_scc, |vcc_lo| op_sel:[1,0,0]
54805480
v_pack_b32_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0]
54815481
// GFX11: v_pack_b32_f16 v255, -|0xfe0b|, -|vcc_hi| op_sel:[0,1,0] ; encoding: [0xff,0x13,0x11,0xd7,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
54825482

5483+
v_pack_b32_f16 v5, v1.h, v2.l
5484+
// GFX11: v_pack_b32_f16 v5, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x11,0xd7,0x01,0x05,0x02,0x00]
5485+
5486+
v_pack_b32_f16 v5, v255.l, v255.h
5487+
// GFX11: v_pack_b32_f16 v5, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x11,0xd7,0xff,0xff,0x03,0x00]
5488+
5489+
v_pack_b32_f16 v5, -src_scc, |vcc_lo|
5490+
// GFX11: v_pack_b32_f16 v5, -src_scc, |vcc_lo| ; encoding: [0x05,0x02,0x11,0xd7,0xfd,0xd4,0x00,0x20]
5491+
5492+
v_pack_b32_f16 v255, -|0xfe0b|, -|vcc_hi|
5493+
// GFX11: v_pack_b32_f16 v255, -|0xfe0b|, -|vcc_hi| ; encoding: [0xff,0x03,0x11,0xd7,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
5494+
54835495
v_perm_b32 v5, v1, v2, s3
54845496
// GFX11: v_perm_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x44,0xd6,0x01,0x05,0x0e,0x00]
54855497

0 commit comments

Comments
 (0)