Skip to content

[DAG] Fold fdiv X, c2 -> fmul X, 1/c2 without AllowReciprocal if exact #93882

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llvm/include/llvm/ADT/APFloat.h
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,14 @@ class APFloat : public APFloatBase {
return Val;
}

/// Factory for Positive and Negative One.
///
/// \param Sem      Floating-point semantics of the value to create.
/// \param Negative True iff the number should be negative.
/// \returns an APFloat holding +1.0, or -1.0 when \p Negative is set.
static APFloat getOne(const fltSemantics &Sem, bool Negative = false) {
  // Build the magnitude from an unsigned literal and flip the sign afterwards.
  // The (semantics, integer) constructor takes an unsigned integerPart, so
  // passing -1 directly would wrap to a huge positive value instead of -1.0.
  APFloat Val(Sem, 1U);
  if (Negative)
    Val.changeSign();
  return Val;
Comment on lines +971 to +972
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit

Suggested change
APFloat Val(Sem, Negative ? -1 : 1);
return Val;
return APFloat(Sem, Negative ? -1.0 : 1.0);

?

}

/// Factory for Positive and Negative Infinity.
///
/// \param Negative True iff the number should be negative.
Expand Down
41 changes: 22 additions & 19 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17271,26 +17271,29 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (SDValue V = combineRepeatedFPDivisors(N))
return V;

if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) {
// Compute the reciprocal 1.0 / c2.
const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip(N1APF.getSemantics(), 1); // 1.0
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).
if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
(!LegalOperations ||
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
// backend)... we should handle this gracefully after Legalize.
// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
TLI.isOperationLegal(ISD::ConstantFP, VT) ||
TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getConstantFP(Recip, DL, VT));
}
// fold (fdiv X, c2) -> (fmul X, 1/c2) if there is no loss in precision, or
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't do this in instcombine already, we probably should also do this there

// the loss is acceptable with AllowReciprocal.
if (auto *N1CFP = isConstOrConstSplatFP(N1, true)) {
// Compute the reciprocal 1.0 / c2.
const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip = APFloat::getOne(N1APF.getSemantics());
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).
if (((st == APFloat::opOK && !Recip.isDenormal()) ||
(st == APFloat::opInexact &&
(Options.UnsafeFPMath || Flags.hasAllowReciprocal()))) &&
(!LegalOperations ||
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
// backend)... we should handle this gracefully after Legalize.
// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
TLI.isOperationLegal(ISD::ConstantFP, VT) ||
TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getConstantFP(Recip, DL, VT));
}

if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
Expand Down
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AArch64/fcvt-fixed.ll
Original file line number Diff line number Diff line change
Expand Up @@ -412,10 +412,10 @@ define half @scvtf_f16_i32_7(i32 %int) {
; CHECK-NO16-LABEL: scvtf_f16_i32_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, w0
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -432,10 +432,10 @@ define half @scvtf_f16_i32_15(i32 %int) {
; CHECK-NO16-LABEL: scvtf_f16_i32_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, w0
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -452,10 +452,10 @@ define half @scvtf_f16_i64_7(i64 %long) {
; CHECK-NO16-LABEL: scvtf_f16_i64_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, x0
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -472,10 +472,10 @@ define half @scvtf_f16_i64_15(i64 %long) {
; CHECK-NO16-LABEL: scvtf_f16_i64_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, x0
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand Down Expand Up @@ -574,10 +574,10 @@ define half @ucvtf_f16_i32_7(i32 %int) {
; CHECK-NO16-LABEL: ucvtf_f16_i32_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, w0
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -594,10 +594,10 @@ define half @ucvtf_f16_i32_15(i32 %int) {
; CHECK-NO16-LABEL: ucvtf_f16_i32_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, w0
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -614,10 +614,10 @@ define half @ucvtf_f16_i64_7(i64 %long) {
; CHECK-NO16-LABEL: ucvtf_f16_i64_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, x0
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -634,10 +634,10 @@ define half @ucvtf_f16_i64_15(i64 %long) {
; CHECK-NO16-LABEL: ucvtf_f16_i64_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, x0
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand Down
23 changes: 11 additions & 12 deletions llvm/test/CodeGen/AArch64/fdiv-const.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
define float @divf32_2(float %a) nounwind {
; CHECK-LABEL: divf32_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, #2.00000000
; CHECK-NEXT: fdiv s0, s0, s1
; CHECK-NEXT: fmov s1, #0.50000000
; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
%r = fdiv float %a, 2.0
ret float %r
Expand Down Expand Up @@ -46,8 +46,8 @@ define float @divf32_p75_arcp(float %a) nounwind {
define half @divf16_2(half %a) nounwind {
; CHECK-LABEL: divf16_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov h1, #2.00000000
; CHECK-NEXT: fdiv h0, h0, h1
; CHECK-NEXT: fmov h1, #0.50000000
; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: ret
%r = fdiv half %a, 2.0
ret half %r
Expand All @@ -67,9 +67,9 @@ define half @divf16_32768(half %a) nounwind {
define half @divf16_32768_arcp(half %a) nounwind {
; CHECK-LABEL: divf16_32768_arcp:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #512 // =0x200
; CHECK-NEXT: mov w8, #30720 // =0x7800
; CHECK-NEXT: fmov h1, w8
; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: fdiv h0, h0, h1
; CHECK-NEXT: ret
%r = fdiv arcp half %a, 32768.0
ret half %r
Expand All @@ -78,8 +78,8 @@ define half @divf16_32768_arcp(half %a) nounwind {
define double @divf64_2(double %a) nounwind {
; CHECK-LABEL: divf64_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d1, #2.00000000
; CHECK-NEXT: fdiv d0, d0, d1
; CHECK-NEXT: fmov d1, #0.50000000
; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: ret
%r = fdiv double %a, 2.0
ret double %r
Expand All @@ -88,8 +88,8 @@ define double @divf64_2(double %a) nounwind {
define <4 x float> @divv4f32_2(<4 x float> %a) nounwind {
; CHECK-LABEL: divv4f32_2:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #64, lsl #24
; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #63, lsl #24
; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%r = fdiv <4 x float> %a, <float 2.0, float 2.0, float 2.0, float 2.0>
ret <4 x float> %r
Expand Down Expand Up @@ -141,9 +141,8 @@ define <4 x float> @divv4f32_24816(<4 x float> %a) nounwind {
define <vscale x 4 x float> @divnxv4f32_2(<vscale x 4 x float> %a) nounwind {
; CHECK-LABEL: divnxv4f32_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z1.s, #2.00000000
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: fmul z0.s, p0/m, z0.s, #0.5
; CHECK-NEXT: ret
%r = fdiv <vscale x 4 x float> %a, splat (float 2.0)
ret <vscale x 4 x float> %r
Expand Down
Loading
Loading