diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b6023b4f3fbcf..d4091ed6e770c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3579,15 +3579,22 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con return SDValue(); } - assert(N0.getSimpleValueType() == MVT::f64); + return LowerF64ToF16Safe(N0, DL, DAG); +} + +// return node in i32 +SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, + SelectionDAG &DAG) const { + assert(Src.getSimpleValueType() == MVT::f64); // f64 -> f16 conversion using round-to-nearest-even rounding mode. + // TODO: We can generate better code for True16. const unsigned ExpMask = 0x7ff; const unsigned ExpBiasf64 = 1023; const unsigned ExpBiasf16 = 15; SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); - SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0); + SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src); SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U, DAG.getConstant(32, DL, MVT::i64)); UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32); @@ -3661,8 +3668,7 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) con Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign, DAG.getConstant(0x8000, DL, MVT::i32)); - V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); - return DAG.getZExtOrTrunc(V, DL, Op.getValueType()); + return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V); } SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index a42214865ccfd..0dd2183b72b24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -97,6 +97,9 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, + SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; protected: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b08b6b46fc52c..76c4546f1207e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6903,7 +6903,17 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { if (Op.getOpcode() != ISD::FP_ROUND) return Op; - SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); + if (!Subtarget->has16BitInsts()) { + SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); + return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); + } + if (getTargetMachine().Options.UnsafeFPMath) { + SDValue Flags = Op.getOperand(1); + SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags); + return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags); + } + SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16); return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc); } diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 6ab17c91c1439..0a900f904bec5 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -7,8 +7,10 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s define amdgpu_kernel void @fptrunc_f32_to_f16( ; SI-SDAG-LABEL: fptrunc_f32_to_f16: @@ -131,35 +133,65 @@ define amdgpu_kernel void @fptrunc_f32_to_f16( ; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fptrunc_f32_to_f16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fptrunc_f32_to_f16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -298,39 +330,73 @@ define amdgpu_kernel void @fptrunc_f64_to_f16( ; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fptrunc_f64_to_f16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fptrunc_f64_to_f16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fptrunc_f64_to_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_f64_to_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -477,41 +543,77 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( ; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fptrunc_v2f32_to_v2f16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3 -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f32_to_v2f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b64 v[1:2], off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f32_to_v2f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f32_to_v2f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, s3 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f32_to_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s3 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -618,7 +720,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2 ; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-SDAG-NEXT: s_endpgm ; @@ -654,9 +756,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] ; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2 -; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX950-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v2 ; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX950-SDAG-NEXT: s_endpgm ; @@ -676,48 +776,89 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( ; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] -; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, v[2:3] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] -; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v1, v[0:1] +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v2 +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v2, v[2:3] +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, v[0:1] +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v2 +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[4:5] +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v1, s[6:7] +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -848,35 +989,65 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( ; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fneg_fptrunc_f32_to_f16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2 -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fneg_fptrunc_f32_to_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, -v0 +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fneg_fptrunc_f32_to_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fneg_fptrunc_f32_to_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, -s2 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fneg_fptrunc_f32_to_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, -s2 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1008,35 +1179,65 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( ; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2| -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fabs_fptrunc_f32_to_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, |v0| +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fabs_fptrunc_f32_to_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fabs_fptrunc_f32_to_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, |s2| +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fabs_fptrunc_f32_to_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, |s2| +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1168,35 +1369,65 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( ; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0| -; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2| -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, -|v0| +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, -|v0| +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, -|s2| +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fneg_fabs_fptrunc_f32_to_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, -|s2| +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -1329,39 +1560,73 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_zext_i32: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_zext_i32: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -1494,39 +1759,73 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0| -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2| -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, |v0| +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, |s2| +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e64 v0, |s2| +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -1668,39 +1967,73 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32( ; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX950-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32: -; GFX11-SDAG: ; %bb.0: ; %entry -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: s_mov_b32 s6, -1 -; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-SDAG-NEXT: s_mov_b32 s10, s6 -; GFX11-SDAG-NEXT: s_mov_b32 s11, s7 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_mov_b32 s8, s2 -; GFX11-SDAG-NEXT: s_mov_b32 s9, s3 -; GFX11-SDAG-NEXT: s_mov_b32 s4, s0 -; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-SDAG-NEXT: s_mov_b32 s5, s1 -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-SDAG-NEXT: s_endpgm -; -; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32: -; GFX11-GISEL: ; %bb.0: ; %entry -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2 -; GFX11-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_sext_i32: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry +; GFX11-SDAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-TRUE16-NEXT: s_endpgm +; +; GFX11-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_sext_i32: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry +; GFX11-SDAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-SDAG-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-SDAG-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-SDAG-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-SDAG-FAKE16-NEXT: s_endpgm +; +; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_sext_i32: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry +; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 +; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_sext_i32: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %entry +; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2 +; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-GISEL-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 41cbbe57d7a36..49c563eef5d82 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -10,8 +10,10 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-GISEL,GFX10-UNSAFE-GISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SAFE-SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-SAFE-GISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-SDAG %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-UNSAFE-DAG-FAKE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-UNSAFE-GISEL-FAKE16 %s define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in) { ; SI-LABEL: fptrunc_f64_to_f32: @@ -514,29 +516,53 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in) ; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-SAFE-GISEL-NEXT: s_endpgm ; -; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16: -; GFX11-UNSAFE-SDAG: ; %bb.0: -; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1 -; GFX11-UNSAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-UNSAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-UNSAFE-SDAG-NEXT: s_endpgm -; -; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16: -; GFX11-UNSAFE-GISEL: ; %bb.0: -; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] -; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1 -; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-UNSAFE-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-UNSAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-UNSAFE-GISEL-NEXT: s_endpgm +; GFX11-UNSAFE-DAG-TRUE16-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE-DAG-TRUE16: ; %bb.0: +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-UNSAFE-DAG-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-UNSAFE-DAG-TRUE16-NEXT: s_endpgm +; +; GFX11-UNSAFE-DAG-FAKE16-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE-DAG-FAKE16: ; %bb.0: +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-UNSAFE-DAG-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-UNSAFE-DAG-FAKE16-NEXT: s_endpgm +; +; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE-GISEL-TRUE16: ; %bb.0: +; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-UNSAFE-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-UNSAFE-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-UNSAFE-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-UNSAFE-GISEL-TRUE16-NEXT: s_endpgm +; +; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16: +; GFX11-UNSAFE-GISEL-FAKE16: ; %bb.0: +; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-UNSAFE-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3] +; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-UNSAFE-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-UNSAFE-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-UNSAFE-GISEL-FAKE16-NEXT: s_endpgm %result = fptrunc double %in to half %result_i16 = bitcast half %result to i16 store i16 %result_i16, ptr addrspace(1) %out