diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bb78e77a9dc1a..0d89b1270465c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1803,19 +1803,16 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
           1 : VSrc_b32);
 }
 
-// Returns the vreg register class to use for sources of VOP3 instructions for the
-// given VT.
-class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
-  RegisterOperand ret =
-    !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
-          !eq(VT.Size, 96)  : RegisterOperand<VReg_96>,
-          !eq(VT.Size, 64)  : RegisterOperand<VReg_64>,
-          !eq(VT.Size, 48)  : RegisterOperand<VReg_64>,
-          !eq(VT.Size, 16)  : !if(IsTrue16,
-                                  !if(IsFake16, RegisterOperand<VGPR_32>,
-                                      RegisterOperand<VGPR_16>),
-                                  RegisterOperand<VGPR_32>),
-          1 : RegisterOperand<VGPR_32>);
+// VGPR only VOP3 src with 9 bit encoding
+class getVOP3VRegSrcForVT<ValueType VT> {
+  RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
+                              !eq(VT.Size, 512)  : VRegSrc_512,
+                              !eq(VT.Size, 256)  : VRegSrc_256,
+                              !eq(VT.Size, 192)  : VRegSrc_192,
+                              !eq(VT.Size, 128)  : VRegSrc_128,
+                              !eq(VT.Size, 96)   : VRegSrc_96,
+                              !eq(VT.Size, 64)   : VRegSrc_64,
+                              1 : VRegSrc_32);
 }
 
 // Src2 of VOP3 DPP instructions cannot be a literal
@@ -2852,6 +2849,7 @@ def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
 def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>;
 def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
 def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
+def VOP_I32_V2F32_I32_F32 : VOPProfile<[i32, v2f32, i32, f32]>;
 def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
 def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
 def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 900c91731aa1b..15f2dd47c7f99 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -418,12 +418,27 @@ def VOP_MADMK_F16_fake16 : VOP_MADMK <f16> {
 }
 def VOP_MADMK_F32 : VOP_MADMK <f32>;
 
+// Returns the vreg register class to use for sources of VOP3 instructions for the
+// given VT.
+class getVOP3VRegForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
+  RegisterOperand ret =
+    !cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
+          !eq(VT.Size, 96)  : RegisterOperand<VReg_96>,
+          !eq(VT.Size, 64)  : RegisterOperand<VReg_64>,
+          !eq(VT.Size, 48)  : RegisterOperand<VReg_64>,
+          !eq(VT.Size, 16)  : !if(IsTrue16,
+                                  !if(IsFake16, RegisterOperand<VGPR_32>,
+                                      RegisterOperand<VGPR_16>),
+                                  RegisterOperand<VGPR_32>),
+          1 : RegisterOperand<VGPR_32>);
+}
+
 // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
 // and processing time but it makes it easier to convert to mad.
 class VOP_MAC <ValueType vt0, ValueType vt1 = vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<vt0>.ret:$src2);
   // Src2 must accept the same operand types as vdst, namely VGPRs only
-  let Src2RC64 = getVOP3VRegSrcForVT<vt0, IsTrue16, IsFake16>.ret;
+  let Src2RC64 = getVOP3VRegForVT<vt0, IsTrue16, IsFake16>.ret;
   let Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, 3,
                        0, HasModifiers, HasModifiers, HasOMod,
                        Src0Mod, Src1Mod, Src2Mod>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index afafc2ecccfaf..1447804871809 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1052,7 +1052,11 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<VOPProfile P> :
   let HasFP4DstByteSel = 1;
 }
 
-def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
+class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P>
+    : VOP3_Profile<P, VOP3_OPSEL> {
+
+  let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VRegSrcForVT<P.Src0VT>.ret,
+                     getVOP3SrcForVT<P.Src0VT>.ret);
   let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
                           Int32InputMods:     $src1_modifiers, Src1RC64:$src1,
                           FP32InputMods:      $src2_modifiers, Src2RC64:$src2,
@@ -1100,6 +1104,11 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
   let HasExt32BitDPP = 0;
   let HasExtVOP3DPP = 0;
   let HasExt64BitDPP = 0;
+
+  // All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
+  // any operand slots > 32 bit.
+  let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VRegSrcForVT<P.Src0VT>.ret,
+                     getVOP3SrcForVT<P.Src0VT>.ret);
 }
 
 let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
@@ -1141,7 +1150,10 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
   let Constraints = "@earlyclobber $vdst" in {
     defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<VOP_I32_V2F16_F32_F32>>;
     defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<VOP_I32_V2BF16_F32_F32>>;
-    defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+    defm V_CVT_SCALEF32_SR_PK_FP4_F32
+        : VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32",
+                   VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<
+                       VOP_I32_V2F32_I32_F32>>;
   }
 }
 defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index a0ba97d3b639c..1ab27337632b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -1282,3 +1282,1162 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte3(<2 x bfloat> %src0, float %scale, i
   %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 3)
   ret i32 %ret
 }
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv_inreg_src(<16 x float> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_vv_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v2
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v1
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[2:17], v[2:17], v0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_vv_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v19, v2
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, v1
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[2:17], v[2:17], v0
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    s_endpgm
+ 
%cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float %scale) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl_inreg_src(<16 x float> inreg %src, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], s16 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_sl_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], v18 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 +; GFX950-GISEL-NEXT: s_endpgm + %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float 100.0) + store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv_inreg_src(<16 x float> inreg %src, float %scale, ptr addrspace(1) %out) { +; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_vv_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v2 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v1 +; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[2:17], v[2:17], v0 +; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16 +; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_vv_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] +; GFX950-GISEL-NEXT: 
v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, v1
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[2:17], v[2:17], v0
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float %scale)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl_inreg_src(<16 x float> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-SDAG-NEXT:    s_mov_b32 s16, 0x42c80000
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], s16
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_sl_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], v18
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float 100.0)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte0_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte0_dst_lo_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_f16_fp8 v1, s0, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 false)
+  ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false)
+  ret <2 x half> %ret
+}
+
+define <2 x 
half> @test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte3_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte3_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte0_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte0_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_fp8_byte3_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte3_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 true) + ret <2 x half> %ret +} + +define float @test_cvt_scalef32_f32_fp8_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + 
%ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 0) + ret float %ret +} + +define float @test_cvt_scalef32_f32_fp8_byte1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 1) + ret float %ret +} + +define float @test_cvt_scalef32_f32_fp8_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 2) + ret float %ret +} + +define float @test_cvt_scalef32_f32_fp8_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 3) + ret float %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte0_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte0_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte3_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte3_dst_lo_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 false) + ret <2 x half> %ret +} + +define <2 x half> 
@test_cvt_scalef32_f16_bf8_byte0_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte0_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 true) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scalef32_f16_bf8_byte3_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) { +; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte3_dst_hi_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,1,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 true) + ret <2 x half> %ret +} + +define float @test_cvt_scalef32_f32_bf8_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 0) + ret float %ret +} + +define float @test_cvt_scalef32_f32_bf8_byte1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 1) + ret float %ret +} + +define float @test_cvt_scalef32_f32_bf8_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 2) + ret float %ret +} + +define float @test_cvt_scalef32_f32_bf8_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: 
test_cvt_scalef32_f32_bf8_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 3) + ret float %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word0_inreg_src(<2 x i16> inreg %old, float %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v3, v0, v1, v2 +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word1_inreg_src(<2 x i16> inreg %old, float %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v3, v0, v1, v2 op_sel:[0,0,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word0_inreg_src(<2 x i16> inreg %old, float %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v3, v0, v1, v2 +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word1_inreg_src(<2 x i16> %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v0, s0, v1, v2 op_sel:[0,0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 false) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 true) + ret <2 x float> %ret +} + +define <2 x float> 
@test_cvt_scalef32_pk_f32_bf8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 false) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scalef32_pk_f32_bf8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 true) + ret <2 x float> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word0_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word1_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, s0, v1 op_sel:[0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word0_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word1_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, s0, v1 op_sel:[0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word0_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word1_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, s0, v1 op_sel:[0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word0_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, s0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 false) + ret <2 x i16> %ret +} + +define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word1_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, s0, v1 op_sel:[0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 true) + ret <2 x i16> %ret +} + +define <2 x float> @test_cvt_scale_f32_fp4_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f32_fp4_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 0) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scale_f32_fp4_byte1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f32_fp4_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 1) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scale_f32_fp4_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f32_fp4_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 2) + ret <2 x float> %ret +} + +define <2 x float> @test_cvt_scale_f32_fp4_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f32_fp4_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 3) + ret <2 x float> %ret +} + +define i32 @test_cvt_scale_fp4_f32_byte0_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scale_fp4_f32_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 0) + ret i32 %ret +} + +define i32 
@test_cvt_scale_fp4_f32_byte1_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scale_fp4_f32_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1) + ret i32 %ret +} + +define i32 @test_cvt_scale_fp4_f32_byte2_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scale_fp4_f32_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2) + ret i32 %ret +} + +define i32 @test_cvt_scale_fp4_f32_byte3_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) { +; GCN-LABEL: test_cvt_scale_fp4_f32_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 3) + ret i32 %ret +} + +define <2 x half> @test_cvt_scale_f16_fp4_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f16_fp4_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 0) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scale_f16_fp4_byte1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f16_fp4_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 1) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scale_f16_fp4_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f16_fp4_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 2) + ret <2 x half> %ret +} + +define <2 x half> @test_cvt_scale_f16_fp4_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_f16_fp4_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 3) + ret <2 x half> %ret +} + +define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_bf16_fp4_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, 
i32 0) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_bf16_fp4_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[0,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 1) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte2_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_bf16_fp4_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 2) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scale_bf16_fp4_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[1,1,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 3) + ret <2 x bfloat> %ret +} + +define <32 x float> @test_cvt_scale_pk32_f32_fp6_inreg_src(<6 x i32> inreg %src, float %scale) { +; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v39, s17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[34:39], v32 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 +; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[34:39], v32 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale) + ret <32 x float> %ret +} + +define <32 x float> @test_cvt_scale_pk32_f32_bf6_inreg_src(<6 x i32> inreg %src, float %scale) { +; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6_inreg_src: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, s1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, s2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, s3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, s16 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v39, s17 +; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[34:39], v32 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6_inreg_src: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[38:39], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v32, v0
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[36:37], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[34:35], s[0:1]
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[34:39], v32
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale)
+  ret <32 x float> %ret
+}
+
+define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[18:23], v16
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[18:23], v16
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale)
+  ret <32 x half> %ret
+}
+
+define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl_inreg_src(<6 x i32> inreg %src) {
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0)
+  ret <32 x half> %ret
+}
+
+define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v16, v0
+; GCN-NEXT:    v_mov_b32_e32 v18, s0
+; GCN-NEXT:    v_mov_b32_e32 v19, s1
+; GCN-NEXT:    v_mov_b32_e32 v20, s2
+; GCN-NEXT:    v_mov_b32_e32 v21, s3
+; GCN-NEXT:    v_mov_b32_e32 v22, s16
+; GCN-NEXT:    v_mov_b32_e32 v23, s17
+; GCN-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[18:23], v16
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float %scale)
+  ret <32 x bfloat> %ret
+}
+
+define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl_inreg_src(<6 x i32> inreg %src) {
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v16, s0
+; GCN-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-NEXT:    v_mov_b32_e32 v18, s2
+; GCN-NEXT:    v_mov_b32_e32 v19, s3
+; GCN-NEXT:    v_mov_b32_e32 v20, s16
+; GCN-NEXT:    v_mov_b32_e32 v21, s17
+; GCN-NEXT:    s_mov_b32 s0, 0x42c80000
+; GCN-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float 100.0)
+  ret <32 x bfloat> %ret
+}
+
+define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[18:23], v16
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[18:23], v16
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale)
+  ret <32 x half> %ret
+}
+
+define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl_inreg_src(<6 x i32> inreg %src) {
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0)
+  ret <32 x half> %ret
+}
+
+define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v16, v0
+; GCN-NEXT:    v_mov_b32_e32 v18, s0
+; GCN-NEXT:    v_mov_b32_e32 v19, s1
+; GCN-NEXT:    v_mov_b32_e32 v20, s2
+; GCN-NEXT:    v_mov_b32_e32 v21, s3
+; GCN-NEXT:    v_mov_b32_e32 v22, s16
+; GCN-NEXT:    v_mov_b32_e32 v23, s17
+; GCN-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[18:23], v16
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float %scale)
+  ret <32 x bfloat> %ret
+}
+
+define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl_inreg_src(<6 x i32> inreg %src) {
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mov_b32_e32 v16, s0
+; GCN-NEXT:    v_mov_b32_e32 v17, s1
+; GCN-NEXT:    v_mov_b32_e32 v18, s2
+; GCN-NEXT:    v_mov_b32_e32 v19, s3
+; GCN-NEXT:    v_mov_b32_e32 v20, s16
+; GCN-NEXT:    v_mov_b32_e32 v21, s17
+; GCN-NEXT:    s_mov_b32 s0, 0x42c80000
+; GCN-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float 100.0)
+  ret <32 x bfloat> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_pk_f16_fp8_word0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f16_fp8_word0_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_pk_f16_fp8 v0, s0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 %src, float %scale, i1 false)
+  ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_pk_f16_fp8_word1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f16_fp8_word1_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_pk_f16_fp8 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 %src, float %scale, i1 true)
+  ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_pk_f16_bf8_word0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f16_bf8_word0_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_pk_f16_bf8 v0, s0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 %src, float %scale, i1 false)
+  ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_pk_f16_bf8_word1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f16_bf8_word1_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_pk_f16_bf8 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 %src, float %scale, i1 true)
+  ret <2 x half> %ret
+}
+
+define <2 x bfloat> 
@test_cvt_scalef32_pk_bf16_fp8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf16_fp8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 %src, float %scale, i1 false) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scalef32_pk_bf16_fp8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf16_fp8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 %src, float %scale, i1 true) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scalef32_pk_bf16_bf8_word0_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf16_bf8_word0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_bf8 v0, s0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 %src, float %scale, i1 false) + ret <2 x bfloat> %ret +} + +define <2 x bfloat> @test_cvt_scalef32_pk_bf16_bf8_word1_inreg_src(i32 inreg %src, float %scale) { +; GCN-LABEL: test_cvt_scalef32_pk_bf16_bf8_word1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_bf16_bf8 v0, s0, v0 op_sel:[1,0,0] +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 %src, float %scale, i1 true) + ret <2 x bfloat> %ret +} + +define i32 @test_cvt_scalef32_fp4_f16_byte0_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte0_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 0) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_f16_byte1_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 1) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_f16_byte2_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1] +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: s_setpc_b64 s[30:31] + %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 2) + ret i32 %ret +} + +define i32 @test_cvt_scalef32_fp4_f16_byte3_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) { +; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte3_inreg_src: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 3)
+  ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_bf16_byte0_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte0_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 0)
+  ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_bf16_byte1_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 1)
+  ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_bf16_byte2_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 2)
+  ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_bf16_byte3_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte3_inreg_src:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,1]
+; GCN-NEXT:    s_nop 0
+; GCN-NEXT:    v_mov_b32_e32 v0, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 3)
+  ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
index 517c87193598d..4e5b85344197a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
@@ -294,3 +294,404 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl(<32 x half> inreg %src, ptr
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
   ret void
 }
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv_inreg_src(<32 x bfloat> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_vv_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v25, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, v1
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v1
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v0
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl_inreg_src(<32 x bfloat> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_sl_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_sl_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float 100.0)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_f16_vv_inreg_src(<32 x half> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_vv_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v25, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, v1
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_vv_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v2
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v1
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v0
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl_inreg_src(<32 x half> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_sl_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_sl_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float 100.0)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv_inreg_src(<32 x bfloat> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_vv_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v25, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, v1
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_vv_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v1
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v0
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl_inreg_src(<32 x bfloat> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_sl_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_sl_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float 100.0)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_f16_vv_inreg_src(<32 x half> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_vv_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v25, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, v1
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_vv_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v2
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v1
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v0
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float %scale)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl_inreg_src(<32 x half> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_sl_inreg_src:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-SDAG-NEXT:    s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_sl_inreg_src:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT:    s_endpgm
+  %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float 100.0)
+  store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+  ret void
+}
+
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index e0b832d8fe297..099916f48b5e7 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -434,3 +434,66 @@ v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 clamp
 
 // GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
 v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_fp6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_bf6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v4, v5
+v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3], v4, v5