Skip to content

Commit 998a431

Browse files
committed
[AMDGPU] Fix canonicalization of truncated values.
We were relying on roundings to implicitly canonicalize, which is generally safe, except that roundings may be optimized away, and more generally, other operations may be optimized away as well. To solve this, as suggested by @arsenm, keep fcanonicalize nodes around for longer. Some tests revealed cases where we no longer figured out that previous operations already produced canonicalized results but we could easily change that; this commit includes those changes. Other tests revealed cases where we no longer figure out that previous operations already produced canonicalized results but larger changes are needed to detect that; this commit disables those tests or updates the expected results. Fixes llvm#82937.
1 parent a9304ed commit 998a431

13 files changed

+1014
-1402
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,26 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
194194
}];
195195
}
196196

197-
class is_canonicalized<SDPatternOperator op> : PatFrag<
197+
class is_canonicalized_1<SDPatternOperator op> : PatFrag<
198+
(ops node:$src0),
199+
(op $src0),
200+
[{
201+
const SITargetLowering &Lowering =
202+
*static_cast<const SITargetLowering *>(getTargetLowering());
203+
204+
return Lowering.isCanonicalized(*CurDAG, N->getOperand(0));
205+
}]> {
206+
207+
// TODO: Improve the Legalizer for g_build_vector in Global Isel to match this class
208+
let GISelPredicateCode = [{
209+
const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
210+
MF.getSubtarget().getTargetLowering());
211+
212+
return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction&>(MF));
213+
}];
214+
}
215+
216+
class is_canonicalized_2<SDPatternOperator op> : PatFrag<
198217
(ops node:$src0, node:$src1),
199218
(op $src0, $src1),
200219
[{

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12481,6 +12481,10 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1248112481
case ISD::FREM:
1248212482
case ISD::FP_ROUND:
1248312483
case ISD::FP_EXTEND:
12484+
case ISD::FP16_TO_FP:
12485+
case ISD::FP_TO_FP16:
12486+
case ISD::BF16_TO_FP:
12487+
case ISD::FP_TO_BF16:
1248412488
case ISD::FLDEXP:
1248512489
case AMDGPUISD::FMUL_LEGACY:
1248612490
case AMDGPUISD::FMAD_FTZ:
@@ -12500,6 +12504,9 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1250012504
case AMDGPUISD::CVT_F32_UBYTE1:
1250112505
case AMDGPUISD::CVT_F32_UBYTE2:
1250212506
case AMDGPUISD::CVT_F32_UBYTE3:
12507+
case AMDGPUISD::FP_TO_FP16:
12508+
case AMDGPUISD::SIN_HW:
12509+
case AMDGPUISD::COS_HW:
1250312510
return true;
1250412511

1250512512
// It can/will be lowered or combined as a bit operation.
@@ -12509,6 +12516,20 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1250912516
case ISD::FCOPYSIGN:
1251012517
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
1251112518

12519+
case ISD::AND:
12520+
if (Op.getValueType() == MVT::i32) {
12521+
// Be careful as we only know it is a bitcast floating point type. It
12522+
// could be f32, v2f16, we have no way of knowing. Luckily the constant
12523+
// value that we optimize for, which comes up in fp32 to bf16 conversions,
12524+
// is valid to optimize for all types.
12525+
if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12526+
if (RHS->getZExtValue() == 0xffff0000) {
12527+
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12528+
}
12529+
}
12530+
}
12531+
break;
12532+
1251212533
case ISD::FSIN:
1251312534
case ISD::FCOS:
1251412535
case ISD::FSINCOS:
@@ -12574,6 +12595,9 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1257412595
return false;
1257512596

1257612597
case ISD::BITCAST:
12598+
// TODO: This is incorrect as it loses track of the operand's type. We may
12599+
// end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12600+
// same bits that are canonicalized in one type need not be in the other.
1257712601
return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
1257812602
case ISD::TRUNCATE: {
1257912603
// Hack round the mess we make when legalizing extract_vector_elt
@@ -12603,20 +12627,21 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1260312627
case Intrinsic::amdgcn_trig_preop:
1260412628
case Intrinsic::amdgcn_log:
1260512629
case Intrinsic::amdgcn_exp2:
12630+
case Intrinsic::amdgcn_sqrt:
1260612631
return true;
1260712632
default:
1260812633
break;
1260912634
}
1261012635

12611-
[[fallthrough]];
12636+
break;
1261212637
}
1261312638
default:
12614-
// FIXME: denormalsEnabledForType is broken for dynamic
12615-
return denormalsEnabledForType(DAG, Op.getValueType()) &&
12616-
DAG.isKnownNeverSNaN(Op);
12639+
break;
1261712640
}
1261812641

12619-
llvm_unreachable("invalid operation");
12642+
// FIXME: denormalsEnabledForType is broken for dynamic
12643+
return denormalsEnabledForType(DAG, Op.getValueType()) &&
12644+
DAG.isKnownNeverSNaN(Op);
1262012645
}
1262112646

1262212647
bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
@@ -12840,27 +12865,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
1284012865
}
1284112866
}
1284212867

12843-
unsigned SrcOpc = N0.getOpcode();
12844-
12845-
// If it's free to do so, push canonicalizes further up the source, which may
12846-
// find a canonical source.
12847-
//
12848-
// TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
12849-
// sNaNs.
12850-
if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
12851-
auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
12852-
if (CRHS && N0.hasOneUse()) {
12853-
SDLoc SL(N);
12854-
SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
12855-
N0.getOperand(0));
12856-
SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
12857-
DCI.AddToWorklist(Canon0.getNode());
12858-
12859-
return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
12860-
}
12861-
}
12862-
12863-
return isCanonicalized(DAG, N0) ? N0 : SDValue();
12868+
return SDValue();
1286412869
}
1286512870

1286612871
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2944,6 +2944,34 @@ def : GCNPat<
29442944
(V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0,
29452945
(V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
29462946

2947+
// If fcanonicalize's operand is implicitly canonicalized, we only need a copy.
2948+
let AddedComplexity = 1000 in {
2949+
def : GCNPat<
2950+
(is_canonicalized_1<fcanonicalize> f16:$src),
2951+
(COPY f16:$src)
2952+
>;
2953+
2954+
def : GCNPat<
2955+
(is_canonicalized_1<fcanonicalize> v2f16:$src),
2956+
(COPY v2f16:$src)
2957+
>;
2958+
2959+
def : GCNPat<
2960+
(is_canonicalized_1<fcanonicalize> f32:$src),
2961+
(COPY f32:$src)
2962+
>;
2963+
2964+
def : GCNPat<
2965+
(is_canonicalized_1<fcanonicalize> v2f32:$src),
2966+
(COPY v2f32:$src)
2967+
>;
2968+
2969+
def : GCNPat<
2970+
(is_canonicalized_1<fcanonicalize> f64:$src),
2971+
(COPY f64:$src)
2972+
>;
2973+
}
2974+
29472975
// Prefer selecting to max when legal, but using mul is always valid.
29482976
let AddedComplexity = -5 in {
29492977

@@ -3277,8 +3305,8 @@ def : GCNPat <
32773305

32783306
let AddedComplexity = 5 in {
32793307
def : GCNPat <
3280-
(v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
3281-
(f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
3308+
(v2f16 (is_canonicalized_2<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
3309+
(f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
32823310
(V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
32833311
>;
32843312
}
@@ -3590,6 +3618,17 @@ FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
35903618
DSTCLAMP.NONE, DSTOMOD.NONE)
35913619
>;
35923620

3621+
class
3622+
FPMinCanonMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
3623+
SDPatternOperator max_or_min_oneuse> : GCNPat <
3624+
(min_or_max (is_canonicalized_1<fcanonicalize>
3625+
(max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
3626+
(VOP3Mods vt:$src1, i32:$src1_mods))),
3627+
(vt (VOP3Mods vt:$src2, i32:$src2_mods))),
3628+
(minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
3629+
DSTCLAMP.NONE, DSTOMOD.NONE)
3630+
>;
3631+
35933632
let OtherPredicates = [isGFX11Plus] in {
35943633
def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
35953634
def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
@@ -3599,6 +3638,10 @@ def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
35993638
def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
36003639
def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
36013640
def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
3641+
def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
3642+
def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
3643+
def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
3644+
def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
36023645
}
36033646

36043647
let OtherPredicates = [isGFX9Plus] in {
@@ -3612,6 +3655,10 @@ def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fmi
36123655
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
36133656
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
36143657
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3658+
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3659+
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
3660+
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
3661+
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
36153662
}
36163663

36173664
// Convert a floating-point power of 2 to the integer exponent.

0 commit comments

Comments
 (0)