diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index c15263e0b06f8..f5a8ed9edac16 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -809,6 +809,10 @@ class CombinerHelper { bool matchCombineFMinMaxNaN(MachineInstr &MI, unsigned &Info) const; + bool matchRepeatedFPDivisor(MachineInstr &MI, + SmallVector<MachineInstr *> &MatchInfo) const; + void applyRepeatedFPDivisor(SmallVector<MachineInstr *> &MatchInfo) const; + /// Transform G_ADD(x, G_SUB(y, x)) to y. /// Transform G_ADD(G_SUB(y, x), x) to y. bool matchAddSubSameReg(MachineInstr &MI, Register &Src) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index efd88524a159e..1447b293146db 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -211,6 +211,7 @@ def constantfp_matchinfo : GIDefMatchData<"ConstantFP*">; def build_fn_matchinfo : GIDefMatchData<"std::function<void(MachineIRBuilder &)>">; def unsigned_matchinfo: GIDefMatchData<"unsigned">; +def mi_vector_matchinfo : GIDefMatchData<"SmallVector<MachineInstr *>">; def copy_prop : GICombineRule< (defs root:$d), @@ -1327,6 +1328,14 @@ def combine_minmax_nan: GICombineRule< [{ return Helper.matchCombineFMinMaxNaN(*${root}, ${info}); }]), (apply [{ Helper.replaceSingleDefInstWithOperand(*${root}, ${info}); }])>; +// Combine multiple FDIVs with the same divisor into multiple FMULs by the +// reciprocal. 
+def fdiv_repeated_divison: GICombineRule< + (defs root:$root, mi_vector_matchinfo:$matchinfo), + (match (G_FDIV $dst, $src1, $src2):$root, + [{ return Helper.matchRepeatedFPDivisor(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyRepeatedFPDivisor(${matchinfo}); }])>; + // Transform (add x, (sub y, x)) -> y // Transform (add (sub y, x), x) -> y def add_sub_reg_frags : GICombinePatFrag< @@ -2051,7 +2060,7 @@ def all_combines : GICombineGroup<[integer_reassoc_combines, trivial_combines, constant_fold_cast_op, fabs_fneg_fold, intdiv_combines, mulh_combines, redundant_neg_operands, and_or_disjoint_mask, fma_combines, fold_binop_into_select, - sub_add_reg, select_to_minmax, + sub_add_reg, select_to_minmax, fdiv_repeated_divison, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, simplify_neg_minmax, combine_concat_vector, sext_trunc, zext_trunc, prefer_sign_combines, shuffle_combines, diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b1e851183de0d..7a2124d48c9ac 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6389,6 +6389,76 @@ bool CombinerHelper::matchCombineFMinMaxNaN(MachineInstr &MI, return MatchNaN(1) || MatchNaN(2); } +// Combine multiple FDIVs with the same divisor into multiple FMULs by the +// reciprocal. +// E.g., (a / Y; b / Y;) -> (recip = 1.0 / Y; a * recip; b * recip) +bool CombinerHelper::matchRepeatedFPDivisor( + MachineInstr &MI, SmallVector<MachineInstr *> &MatchInfo) const { + assert(MI.getOpcode() == TargetOpcode::G_FDIV); + auto *MF = MI.getMF(); + const TargetOptions &Options = MF->getTarget().Options; + + Register X = MI.getOperand(1).getReg(); + Register Y = MI.getOperand(2).getReg(); + + bool UnsafeMath = Options.UnsafeFPMath; + if (!UnsafeMath && !MI.getFlag(MachineInstr::MIFlag::FmArcp)) + return false; + + // Skip if current node is a reciprocal/fneg-reciprocal. 
+ auto N0CFP = isConstantOrConstantSplatVectorFP(*MRI.getVRegDef(X), MRI); + if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0))) + return false; + + // Exit early if the target does not want this transform or if there can't + // possibly be enough uses of the divisor to make the transform worthwhile. + unsigned MinUses = getTargetLowering().combineRepeatedFPDivisors(); + if (!MinUses) + return false; + + // Find all FDIV users of the same divisor. For the moment we limit all + // instructions to a single BB and use the first Instr in MatchInfo as the + // dominating position. + MatchInfo.push_back(&MI); + for (auto &U : MRI.use_nodbg_instructions(Y)) { + if (&U == &MI || U.getParent() != MI.getParent()) + continue; + if (U.getOpcode() == TargetOpcode::G_FDIV && + U.getOperand(2).getReg() == Y && U.getOperand(1).getReg() != Y) { + // This division is eligible for optimization only if global unsafe math + // is enabled or if this division allows reciprocal formation. + if (UnsafeMath || U.getFlag(MachineInstr::MIFlag::FmArcp)) { + MatchInfo.push_back(&U); + if (dominates(U, *MatchInfo[0])) + std::swap(MatchInfo[0], MatchInfo.back()); + } + } + } + + // Now that we have the actual number of divisor uses, make sure it meets + // the minimum threshold specified by the target. + return MatchInfo.size() >= MinUses; +} + +void CombinerHelper::applyRepeatedFPDivisor( + SmallVector<MachineInstr *> &MatchInfo) const { + // Generate the new div at the position of the first instruction, that we have + // ensured will dominate all other instructions. + Builder.setInsertPt(*MatchInfo[0]->getParent(), MatchInfo[0]); + LLT Ty = MRI.getType(MatchInfo[0]->getOperand(0).getReg()); + auto Div = Builder.buildFDiv(Ty, Builder.buildFConstant(Ty, 1.0), + MatchInfo[0]->getOperand(2).getReg(), + MatchInfo[0]->getFlags()); + + // Replace all found div's with fmul instructions. 
+ for (MachineInstr *MI : MatchInfo) { + Builder.setInsertPt(*MI->getParent(), MI); + Builder.buildFMul(MI->getOperand(0).getReg(), MI->getOperand(1).getReg(), + Div->getOperand(0).getReg(), MI->getFlags()); + MI->eraseFromParent(); + } +} + bool CombinerHelper::matchAddSubSameReg(MachineInstr &MI, Register &Src) const { assert(MI.getOpcode() == TargetOpcode::G_ADD && "Expected a G_ADD"); Register LHS = MI.getOperand(1).getReg(); diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll index d8f7f0a306684..107bc96fae376 100644 --- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll +++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll @@ -12,22 +12,14 @@ ; => ; recip = 1.0 / D; a * recip; b * recip; c * recip; define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 { -; CHECK-SD-LABEL: three_fdiv_float: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmov s4, #1.00000000 -; CHECK-SD-NEXT: fdiv s4, s4, s0 -; CHECK-SD-NEXT: fmul s0, s1, s4 -; CHECK-SD-NEXT: fmul s1, s2, s4 -; CHECK-SD-NEXT: fmul s2, s3, s4 -; CHECK-SD-NEXT: b foo_3f -; -; CHECK-GI-LABEL: three_fdiv_float: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fdiv s4, s1, s0 -; CHECK-GI-NEXT: fdiv s1, s2, s0 -; CHECK-GI-NEXT: fdiv s2, s3, s0 -; CHECK-GI-NEXT: fmov s0, s4 -; CHECK-GI-NEXT: b foo_3f +; CHECK-LABEL: three_fdiv_float: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s4, #1.00000000 +; CHECK-NEXT: fdiv s4, s4, s0 +; CHECK-NEXT: fmul s0, s1, s4 +; CHECK-NEXT: fmul s1, s2, s4 +; CHECK-NEXT: fmul s2, s3, s4 +; CHECK-NEXT: b foo_3f %div = fdiv float %a, %D %div1 = fdiv float %b, %D %div2 = fdiv float %c, %D @@ -36,22 +28,14 @@ define void @three_fdiv_float(float %D, float %a, float %b, float %c) #0 { } define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 { -; CHECK-SD-LABEL: three_fdiv_double: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmov d4, #1.00000000 -; CHECK-SD-NEXT: fdiv d4, d4, d0 -; CHECK-SD-NEXT: fmul d0, d1, d4 -; CHECK-SD-NEXT: fmul d1, d2, 
d4 -; CHECK-SD-NEXT: fmul d2, d3, d4 -; CHECK-SD-NEXT: b foo_3d -; -; CHECK-GI-LABEL: three_fdiv_double: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fdiv d4, d1, d0 -; CHECK-GI-NEXT: fdiv d1, d2, d0 -; CHECK-GI-NEXT: fdiv d2, d3, d0 -; CHECK-GI-NEXT: fmov d0, d4 -; CHECK-GI-NEXT: b foo_3d +; CHECK-LABEL: three_fdiv_double: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov d4, #1.00000000 +; CHECK-NEXT: fdiv d4, d4, d0 +; CHECK-NEXT: fmul d0, d1, d4 +; CHECK-NEXT: fmul d1, d2, d4 +; CHECK-NEXT: fmul d2, d3, d4 +; CHECK-NEXT: b foo_3d %div = fdiv double %a, %D %div1 = fdiv double %b, %D %div2 = fdiv double %c, %D @@ -60,22 +44,14 @@ define void @three_fdiv_double(double %D, double %a, double %b, double %c) #0 { } define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { -; CHECK-SD-LABEL: three_fdiv_4xfloat: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmov v4.4s, #1.00000000 -; CHECK-SD-NEXT: fdiv v4.4s, v4.4s, v0.4s -; CHECK-SD-NEXT: fmul v0.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v4.4s -; CHECK-SD-NEXT: fmul v2.4s, v3.4s, v4.4s -; CHECK-SD-NEXT: b foo_3_4xf -; -; CHECK-GI-LABEL: three_fdiv_4xfloat: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fdiv v4.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v0.4s -; CHECK-GI-NEXT: fdiv v2.4s, v3.4s, v0.4s -; CHECK-GI-NEXT: mov v0.16b, v4.16b -; CHECK-GI-NEXT: b foo_3_4xf +; CHECK-LABEL: three_fdiv_4xfloat: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov v4.4s, #1.00000000 +; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s +; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s +; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s +; CHECK-NEXT: b foo_3_4xf %div = fdiv <4 x float> %a, %D %div1 = fdiv <4 x float> %b, %D %div2 = fdiv <4 x float> %c, %D @@ -84,22 +60,14 @@ define void @three_fdiv_4xfloat(<4 x float> %D, <4 x float> %a, <4 x float> %b, } define void @three_fdiv_2xdouble(<2 x double> %D, <2 x double> %a, <2 x double> %b, <2 x double> %c) #0 { -; CHECK-SD-LABEL: 
three_fdiv_2xdouble: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fmov v4.2d, #1.00000000 -; CHECK-SD-NEXT: fdiv v4.2d, v4.2d, v0.2d -; CHECK-SD-NEXT: fmul v0.2d, v1.2d, v4.2d -; CHECK-SD-NEXT: fmul v1.2d, v2.2d, v4.2d -; CHECK-SD-NEXT: fmul v2.2d, v3.2d, v4.2d -; CHECK-SD-NEXT: b foo_3_2xd -; -; CHECK-GI-LABEL: three_fdiv_2xdouble: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fdiv v4.2d, v1.2d, v0.2d -; CHECK-GI-NEXT: fdiv v1.2d, v2.2d, v0.2d -; CHECK-GI-NEXT: fdiv v2.2d, v3.2d, v0.2d -; CHECK-GI-NEXT: mov v0.16b, v4.16b -; CHECK-GI-NEXT: b foo_3_2xd +; CHECK-LABEL: three_fdiv_2xdouble: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov v4.2d, #1.00000000 +; CHECK-NEXT: fdiv v4.2d, v4.2d, v0.2d +; CHECK-NEXT: fmul v0.2d, v1.2d, v4.2d +; CHECK-NEXT: fmul v1.2d, v2.2d, v4.2d +; CHECK-NEXT: fmul v2.2d, v3.2d, v4.2d +; CHECK-NEXT: b foo_3_2xd %div = fdiv <2 x double> %a, %D %div1 = fdiv <2 x double> %b, %D %div2 = fdiv <2 x double> %c, %D @@ -135,26 +103,47 @@ define void @two_fdiv_double(double %D, double %a, double %b) #0 { ret void } -define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { -; CHECK-SD-LABEL: splat_three_fdiv_4xfloat: +define void @four_fdiv_multi_float(float %D, float %a, float %b, float %c) #0 { +; CHECK-SD-LABEL: four_fdiv_multi_float: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-SD-NEXT: fmov v4.4s, #1.00000000 -; CHECK-SD-NEXT: dup v0.4s, v0.s[0] -; CHECK-SD-NEXT: fdiv v4.4s, v4.4s, v0.4s -; CHECK-SD-NEXT: fmul v0.4s, v1.4s, v4.4s -; CHECK-SD-NEXT: fmul v1.4s, v2.4s, v4.4s -; CHECK-SD-NEXT: fmul v2.4s, v3.4s, v4.4s -; CHECK-SD-NEXT: b foo_3_4xf +; CHECK-SD-NEXT: fmov s4, #1.00000000 +; CHECK-SD-NEXT: fdiv s5, s4, s0 +; CHECK-SD-NEXT: fmul s4, s1, s5 +; CHECK-SD-NEXT: fmul s1, s2, s5 +; CHECK-SD-NEXT: fmul s2, s3, s5 +; CHECK-SD-NEXT: fmul s3, s0, s5 +; CHECK-SD-NEXT: fmov s0, s4 +; CHECK-SD-NEXT: b foo_4f ; -; CHECK-GI-LABEL: splat_three_fdiv_4xfloat: +; CHECK-GI-LABEL: 
four_fdiv_multi_float: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-GI-NEXT: dup v4.4s, v0.s[0] -; CHECK-GI-NEXT: fdiv v0.4s, v1.4s, v4.4s -; CHECK-GI-NEXT: fdiv v1.4s, v2.4s, v4.4s -; CHECK-GI-NEXT: fdiv v2.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: b foo_3_4xf +; CHECK-GI-NEXT: fmov s4, #1.00000000 +; CHECK-GI-NEXT: fdiv s5, s4, s0 +; CHECK-GI-NEXT: fdiv s4, s0, s0 +; CHECK-GI-NEXT: fmul s0, s1, s5 +; CHECK-GI-NEXT: fmul s1, s2, s5 +; CHECK-GI-NEXT: fmul s2, s3, s5 +; CHECK-GI-NEXT: fmov s3, s4 +; CHECK-GI-NEXT: b foo_4f + %div = fdiv float %a, %D + %div1 = fdiv float %b, %D + %div2 = fdiv float %c, %D + %div3 = fdiv float %D, %D + tail call void @foo_4f(float %div, float %div1, float %div2, float %div3) + ret void +} + +define void @splat_three_fdiv_4xfloat(float %D, <4 x float> %a, <4 x float> %b, <4 x float> %c) #0 { +; CHECK-LABEL: splat_three_fdiv_4xfloat: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: fmov v4.4s, #1.00000000 +; CHECK-NEXT: dup v0.4s, v0.s[0] +; CHECK-NEXT: fdiv v4.4s, v4.4s, v0.4s +; CHECK-NEXT: fmul v0.4s, v1.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v2.4s, v4.4s +; CHECK-NEXT: fmul v2.4s, v3.4s, v4.4s +; CHECK-NEXT: b foo_3_4xf %D.ins = insertelement <4 x float> poison, float %D, i64 0 %splat = shufflevector <4 x float> %D.ins, <4 x float> poison, <4 x i32> zeroinitializer %div = fdiv <4 x float> %a, %splat @@ -256,6 +245,7 @@ entry: } declare void @foo_3f(float, float, float) +declare void @foo_4f(float, float, float, float) declare void @foo_3d(double, double, double) declare void @foo_3_4xf(<4 x float>, <4 x float>, <4 x float>) declare void @foo_3_2xd(<2 x double>, <2 x double>, <2 x double>)