-
Notifications
You must be signed in to change notification settings - Fork 14.5k
AMDGPU: Widen f16 minimum/maximum to v2f16 on gfx950 #128121
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) Changes: Unfortunately we only have the vector (v2f16) versions of minimum3 and maximum3. Widen to v2f16 so we can lower as minimum3(x, y, y). Patch is 81.31 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/128121.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 909ad07782fc6..0b13a53a0c989 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -869,8 +869,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMinimum3Maximum3F32())
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
- if (Subtarget->hasMinimum3Maximum3PKF16())
+ if (Subtarget->hasMinimum3Maximum3PKF16()) {
setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::v2f16, Legal);
+
+ // If only the vector form is available, we need to widen to a vector.
+ if (!Subtarget->hasMinimum3Maximum3F16())
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f16, Custom);
+ }
}
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
@@ -5964,6 +5969,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM:
case ISD::FMAXNUM:
return lowerFMINNUM_FMAXNUM(Op, DAG);
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
+ return lowerFMINIMUM_FMAXIMUM(Op, DAG);
case ISD::FLDEXP:
case ISD::STRICT_FLDEXP:
return lowerFLDEXP(Op, DAG);
@@ -5985,8 +5993,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMUL:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
- case ISD::FMINIMUM:
- case ISD::FMAXIMUM:
case ISD::FMINIMUMNUM:
case ISD::FMAXIMUMNUM:
case ISD::UADDSAT:
@@ -6841,6 +6847,34 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
return Op;
}
+SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ if (VT.isVector())
+ return splitBinaryVectorOp(Op, DAG);
+
+ assert(!Subtarget->hasIEEEMinMax() && !Subtarget->hasMinimum3Maximum3F16() &&
+ Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
+ "should not need to widen f16 minimum/maximum to v2f16");
+
+ // Widen f16 operation to v2f16
+
+ // fminimum f16:x, f16:y ->
+ // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
+ // (v2f16 (scalar_to_vector y))), 0
+ SDLoc SL(Op);
+ SDValue WideSrc0 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
+ SDValue WideSrc1 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
+
+ SDValue Widened =
+ DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
+ DAG.getConstant(0, SL, MVT::i32));
+}
+
SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
EVT VT = Op.getValueType();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1cd7f1b29e077..9b2c14862407a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -146,6 +146,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFMINIMUM_FMAXIMUM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
SDValue promoteUniformOpToI32(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 66de7d535db4b..f228824ff750e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -1252,19 +1252,27 @@ define half @v_fmaximum3_f16(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
ret half %max1
@@ -1281,19 +1289,27 @@ define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_commute:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_commute:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v2, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_commute:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v2, v0, v0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %c, half %max0)
ret half %max1
@@ -1311,22 +1327,34 @@ define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %
; GFX12-NEXT: s_wait_alu 0xf1ff
; GFX12-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_fmaximum3_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_max_f16_e32 v1, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
+; GFX942-LABEL: s_fmaximum3_f16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v0, s1
+; GFX942-NEXT: v_max_f16_e32 v1, s0, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, s2, v0
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, s2, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s0, v0
+; GFX942-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_fmaximum3_f16:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s1, s1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, s2, s2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: ; return to shader part epilog
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
%cast = bitcast half %max1 to i16
@@ -1346,19 +1374,28 @@ define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, |v0|, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs0:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, |v0|, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%max0 = call half @llvm.maximum.f16(half %a.fabs, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1376,19 +1413,28 @@ define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, v0, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs1:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, v0, |v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e32 v1, v0, v2
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs1:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fabs = call half @llvm.fabs.f16(half %b)
%max0 = call half @llvm.maximum.f16(half %a, half %b.fabs)
%max1 = call half @llvm.maximum.f16(half %max0, half %c)
@@ -1406,19 +1452,28 @@ define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs2:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e32 v3, v0, v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e32 vcc, v0, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fabs = call half @llvm.fabs.f16(half %c)
%max0 = call half @llvm.maximum.f16(half %a, half %b)
%max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs)
@@ -1436,19 +1491,30 @@ define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, |v0|, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, |v0|, |v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, |v0|, |v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, |v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, |v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX950-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX950-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
%c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1468,19 +1534,30 @@ define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, -v0, -v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, -v2
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, -v0, -v1
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -v0, -v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, -v2
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg half %a
%b.fneg = fneg half %b
%c.fneg = fneg half %c
@@ -1500,19 +1577,30 @@ define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f16_e64 v3, -|v0|, -|v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7e00
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f16_e64 v1, v0, -|v2|
-; GFX9-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_max_f16_e64 v3, -|v0|, -|v1|
+; GFX942-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, -|v0|, -|v1|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX942-NEXT: v_max_f16_e64 v1, v0, -|v2|
+; GFX942-NEXT: v_cmp_o_f16_e64 vcc, v0, -|v2|
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f16_fneg_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX950-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX950-NEXT: v_or_b32_e32 v2, 0x8000, v2
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v1, v1
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_pk_maximum3_f16 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
%b.fabs = call half @llvm.fabs.f16(half %b)
%c.fabs = call half @llvm.fabs.f16(half %c)
@@ -1535,19 +1623,28 @@ define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) {
; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f16_fneg0:
-; GFX9:...
[truncated]
|
I somehow missed this in the initial upstreaming |
Merge activity
|
f56a833
to
8b17104
Compare
Unfortunately we only have the vector (v2f16) versions of minimum3 and maximum3. Widen to v2f16 so we can lower as minimum3(x, y, y).
8b17104
to
e77fd5b
Compare
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/81/builds/4965 Here is the relevant piece of the build log for reference:
|
/cherry-pick e729dc7 |
/pull-request #128132 |
Unfortunately we only have the vector (v2f16) versions of minimum3 and maximum3. Widen to v2f16 so we can lower as minimum3(x, y, y). (cherry picked from commit e729dc7)
Unfortunately we only have the vector (v2f16) versions of minimum3 and maximum3. Widen to v2f16 so we can lower as minimum3(x, y, y).
Unfortunately we only have the vector (v2f16) versions of minimum3 and maximum3. Widen to v2f16 so we can lower as minimum3(x, y, y).
Unfortunately we only have the vector (v2f16) versions of minimum3
and maximum3. Widen to v2f16 so we can lower as minimum3(x, y, y).