-
Notifications
You must be signed in to change notification settings - Fork 13.5k
Update foldFMulReassoc to respect absent fast-math flags #88589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
This change updates a few of the transformations in foldFMulReassoc to respect absent fast-math flags in a few cases where fmul and fdiv instructions were being folded but the code was not checking for fast-math flags on the fdiv instruction and was transferring flags to the folded instruction that were not present on the original fdiv instruction. This fixes llvm#82857
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-llvm-transforms Author: Andy Kaylor (andykaylor) Changes: This change updates a few of the transformations in foldFMulReassoc to This fixes #82857 Full diff: https://github.com/llvm/llvm-project/pull/88589.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 4dc1319f1c437f..96485c40484007 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -636,26 +636,43 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
// expression.
if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) {
Constant *C1;
- if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) {
+ if (match(Op0,
+ m_AllowReassoc(m_OneUse(m_FDiv(m_Constant(C1), m_Value(X)))))) {
// (C1 / X) * C --> (C * C1) / X
Constant *CC1 =
ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL);
- if (CC1 && CC1->isNormalFP())
- return BinaryOperator::CreateFDivFMF(CC1, X, &I);
+ if (CC1 && CC1->isNormalFP()) {
+ // Preserve only fast-math flags that were set on both of the original
+ // instructions
+ auto *NewDiv = BinaryOperator::CreateFDivFMF(CC1, X, &I);
+ NewDiv->andIRFlags(Op0);
+ return NewDiv;
+ }
}
- if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
+ if (match(Op0, m_AllowReassoc(m_FDiv(m_Value(X), m_Constant(C1))))) {
+ // FIXME: This seems like it should also be checking for arcp
// (X / C1) * C --> X * (C / C1)
Constant *CDivC1 =
ConstantFoldBinaryOpOperands(Instruction::FDiv, C, C1, DL);
- if (CDivC1 && CDivC1->isNormalFP())
- return BinaryOperator::CreateFMulFMF(X, CDivC1, &I);
+ if (CDivC1 && CDivC1->isNormalFP()) {
+ // Preserve only fast-math flags that were set on both of the original
+ // instructions
+ auto *NewMul = BinaryOperator::CreateFMulFMF(X, CDivC1, &I);
+ NewMul->andIRFlags(Op0);
+ return NewMul;
+ }
// If the constant was a denormal, try reassociating differently.
// (X / C1) * C --> X / (C1 / C)
Constant *C1DivC =
ConstantFoldBinaryOpOperands(Instruction::FDiv, C1, C, DL);
- if (C1DivC && Op0->hasOneUse() && C1DivC->isNormalFP())
- return BinaryOperator::CreateFDivFMF(X, C1DivC, &I);
+ if (C1DivC && Op0->hasOneUse() && C1DivC->isNormalFP()) {
+ // Preserve only fast-math flags that were set on both of the original
+ // instructions
+ auto *NewDiv = BinaryOperator::CreateFDivFMF(X, C1DivC, &I);
+ NewDiv->andIRFlags(Op0);
+ return NewDiv;
+ }
}
// We do not need to match 'fadd C, X' and 'fsub X, C' because they are
diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll
index f6435f0032891e..b541f455939c7e 100644
--- a/llvm/test/Transforms/InstCombine/fmul.ll
+++ b/llvm/test/Transforms/InstCombine/fmul.ll
@@ -652,12 +652,49 @@ define float @fdiv_constant_numerator_fmul(float %x) {
; CHECK-LABEL: @fdiv_constant_numerator_fmul(
; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]]
; CHECK-NEXT: ret float [[T3]]
+;
+ %t1 = fdiv reassoc float 2.0e+3, %x
+ %t3 = fmul reassoc float %t1, 6.0e+3
+ ret float %t3
+}
+
+; C1/X * C2 => (C1*C2) / X with mixed fast-math flags
+
+define float @fdiv_constant_numerator_fmul_mixed(float %x) {
+; CHECK-LABEL: @fdiv_constant_numerator_fmul_mixed(
+; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]]
+; CHECK-NEXT: ret float [[T3]]
+;
+ %t1 = fdiv reassoc float 2.0e+3, %x
+ %t3 = fmul fast float %t1, 6.0e+3
+ ret float %t3
+}
+
+; C1/X * C2 => (C1*C2) / X with full fast-math flags
+
+define float @fdiv_constant_numerator_fmul_fast(float %x) {
+; CHECK-LABEL: @fdiv_constant_numerator_fmul_fast(
+; CHECK-NEXT: [[T3:%.*]] = fdiv fast float 1.200000e+07, [[X:%.*]]
+; CHECK-NEXT: ret float [[T3]]
+;
+ %t1 = fdiv fast float 2.0e+3, %x
+ %t3 = fmul fast float %t1, 6.0e+3
+ ret float %t3
+}
+
+; C1/X * C2 => (C1*C2) / X with no fast-math flags on the fdiv
+
+define float @fdiv_constant_numerator_fmul_precdiv(float %x) {
+; CHECK-LABEL: @fdiv_constant_numerator_fmul_precdiv(
+; CHECK-NEXT: [[T4:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]]
+; CHECK-NEXT: ret float [[T4]]
;
%t1 = fdiv float 2.0e+3, %x
%t3 = fmul reassoc float %t1, 6.0e+3
ret float %t3
}
+
; C1/X * C2 => (C1*C2) / X is disabled if C1/X has multiple uses
@fmul2_external = external global float
@@ -679,7 +716,8 @@ define float @fdiv_constant_numerator_fmul_extra_use(float %x) {
define float @fdiv_constant_denominator_fmul(float %x) {
; CHECK-LABEL: @fdiv_constant_denominator_fmul(
-; CHECK-NEXT: [[T3:%.*]] = fmul reassoc float [[X:%.*]], 3.000000e+00
+; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc float [[X:%.*]], 6.000000e+03
+; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float [[TMP1]], 2.000000e+03
; CHECK-NEXT: ret float [[T3]]
;
%t1 = fdiv float %x, 2.0e+3
@@ -692,7 +730,7 @@ define <4 x float> @fdiv_constant_denominator_fmul_vec(<4 x float> %x) {
; CHECK-NEXT: [[T3:%.*]] = fmul reassoc <4 x float> [[X:%.*]], <float 3.000000e+00, float 2.000000e+00, float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: ret <4 x float> [[T3]]
;
- %t1 = fdiv <4 x float> %x, <float 2.0e+3, float 3.0e+3, float 2.0e+3, float 1.0e+3>
+ %t1 = fdiv reassoc <4 x float> %x, <float 2.0e+3, float 3.0e+3, float 2.0e+3, float 1.0e+3>
%t3 = fmul reassoc <4 x float> %t1, <float 6.0e+3, float 6.0e+3, float 2.0e+3, float 1.0e+3>
ret <4 x float> %t3
}
@@ -705,7 +743,7 @@ define <4 x float> @fdiv_constant_denominator_fmul_vec_constexpr(<4 x float> %x)
; CHECK-NEXT: ret <4 x float> [[T3]]
;
%constExprMul = bitcast i128 trunc (i160 bitcast (<5 x float> <float 6.0e+3, float 6.0e+3, float 2.0e+3, float 1.0e+3, float undef> to i160) to i128) to <4 x float>
- %t1 = fdiv <4 x float> %x, <float 2.0e+3, float 3.0e+3, float 2.0e+3, float 1.0e+3>
+ %t1 = fdiv reassoc <4 x float> %x, <float 2.0e+3, float 3.0e+3, float 2.0e+3, float 1.0e+3>
%t3 = fmul reassoc <4 x float> %t1, %constExprMul
ret <4 x float> %t3
}
@@ -745,7 +783,8 @@ define float @fdiv_constant_denominator_fmul_denorm(float %x) {
define float @fdiv_constant_denominator_fmul_denorm_try_harder(float %x) {
; CHECK-LABEL: @fdiv_constant_denominator_fmul_denorm_try_harder(
-; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float [[X:%.*]], 0x47E8000000000000
+; CHECK-NEXT: [[TMP1:%.*]] = fmul reassoc float [[X:%.*]], 0x3810000000000000
+; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float [[TMP1]], 3.000000e+00
; CHECK-NEXT: ret float [[T3]]
;
%t1 = fdiv float %x, 3.0
@@ -868,7 +907,8 @@ define float @fmul_fadd_distribute_extra_use(float %x) {
define double @fmul_fadd_fdiv_distribute2(double %x) {
; CHECK-LABEL: @fmul_fadd_fdiv_distribute2(
-; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc double [[X:%.*]], 0x7FE8000000000000
+; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc double [[X:%.*]], 0x10000000000000
+; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc double [[TMP2]], 3.000000e+00
; CHECK-NEXT: [[T3:%.*]] = fadd reassoc double [[TMP1]], 0x34000000000000
; CHECK-NEXT: ret double [[T3]]
;
@@ -883,7 +923,8 @@ define double @fmul_fadd_fdiv_distribute2(double %x) {
define double @fmul_fadd_fdiv_distribute3(double %x) {
; CHECK-LABEL: @fmul_fadd_fdiv_distribute3(
-; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc double [[X:%.*]], 0x7FE8000000000000
+; CHECK-NEXT: [[TMP2:%.*]] = fmul reassoc double [[X:%.*]], 0x10000000000000
+; CHECK-NEXT: [[TMP1:%.*]] = fdiv reassoc double [[TMP2]], 3.000000e+00
; CHECK-NEXT: [[T3:%.*]] = fadd reassoc double [[TMP1]], 0x34000000000000
; CHECK-NEXT: ret double [[T3]]
;
|
This PR is probably opening a huge can of worms. I intend to post an RFC to discuss the general topic of requiring fast-math flag checks on all instructions involved in a transformation. There seem to be a lot of places where we aren't bothering with this, and many existing tests don't have uniform FMF on the instructions being transformed. Getting this clean everywhere may be a huge project. The transformation this is targeting is the one from #82857: In the problem case, fast-math flags are set on the multiplication operation, but not on the division, as a result of inlining a function that didn't have fast-math enabled. The folded operation is assigned the fast-math flags from the multiplication. Issue 82857 arrived at this state using |
define float @fdiv_constant_numerator_fmul_precdiv(float %x) { | ||
; CHECK-LABEL: @fdiv_constant_numerator_fmul_precdiv( | ||
; CHECK-NEXT: [[T4:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]] | ||
; CHECK-NEXT: ret float [[T4]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Confused by this test, wasn't it supposed to not fold due to missing reassoc on fdiv?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it was being folded by an additional case that handled non-constant fdiv and fmul. I've fixed that now. It required updating the input IR for a lot of other cases. In a couple cases adding the extra flags triggered a different transformation than we were previously testing for.
|
||
define float @fdiv_constant_numerator_fmul_precdiv(float %x) { | ||
; CHECK-LABEL: @fdiv_constant_numerator_fmul_precdiv( | ||
; CHECK-NEXT: [[T4:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This case shouldn't have been folded. It turns out one of the transformations I didn't update handles this if the ones I did update let it pass.
@@ -636,26 +636,43 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { | |||
// expression. | |||
if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) { | |||
Constant *C1; | |||
if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) { | |||
if (match(Op0, | |||
m_AllowReassoc(m_OneUse(m_FDiv(m_Constant(C1), m_Value(X)))))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I still think the flags should be some kind of template argument to the m_FDiv
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought about this a bit. One possibility would be to add something like
FastMathFlags RequiredFMF = FastMathFlags::AllowReassoc;
...
m_fmf_FDiv(RequiredFMF, m_Value(X), m_Value(Y))
That's no less verbose than the current method if you only want to check reassoc, but it would allow us to check other flags too. Is this what you had in mind?
The thing I'm not happy about with either the current implementation or the suggestion above is that it isn't well suited to retrieving the fast-math flags for any given operation, which is needed when combining operations. You can see this in the update I posted to handle the (X / Y) * Z --> (X * Z) / Y
case. In that code, we're doing a commutative match on the fmul operands, and I had to deduce which operand was the fdiv it matched and extract the flags. Maybe something like this would be better?
FastMathFlags DivFMF;
...
m_fmf_FDiv(m_reassoc(m_FMF(DivFMF)), m_Value(X), m_Value(Y))
The m_FMF() would retrieve the actual flags used, and m_reassoc() applies a check during the matching.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, something like that would be better. It shouldn't be the dissociated matcher it is today
auto *NewDiv = BinaryOperator::CreateFDivFMF(CC1, X, &I); | ||
NewDiv->andIRFlags(Op0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This API is somewhat unfortunate. Is there a way to use the FastMathFlagGuard to avoid repeating this 3x? Probably not given CreateFDivFMF wants to take flags from an existing instruction
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe it would be better to have a version of CreateFDivFMF and similar that took a FastMathFlags argument rather than an instruction to copy them from. Then we could have utility functions to create an intersection of fast-math flags from two or more FPMathOperator objects. I'll see what I can do with that.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I refactored the handling a bit to use a FastMathFlagGuard
on the Builder object and added new overloads of BinaryOperator::Create*FMF()
to take the flags directly. I thought about possibly introducing a new class that would provide a façade for IRBuilder
and the BinaryOperator::Create*FMF()
methods to combine the FMF handling into a single place. That felt like overkill, but maybe this pattern occurs often enough to justify it.
The thing I didn't like here is the fact that I needed to mix instructions created with IRBuilder
(because intermediate instructions needed to be inserted in the IR) and instructions created directly with BinaryOperator
's static methods (because InstCombine
expects me to return an instruction that hasn't been inserted). This feels kind of clunky.
It also bothers me a bit that IRBuilder::CreateFDivFMF()
and similar don't follow the insertion rules for the rest of the IRBuilder
routine. I think that the insert point will already be set to the instruction I am replacing in the code I modified, but if I wanted to replace code that was using IRBuilder::CreateFDivFMF()
to insert instructions somewhere else, I'd need to manually change the insert point to use FastMathFlagGuard
the way I did.
Here's what I was thinking for the wrapper to combine FMF handling between IRBuilder
and BinaryOperator
calls:
class FMFBuilder {
public:
FMFBuilder(IRBuilder &B, FastMathFlags F) : Builder(B), FMFGuard(B), FMF(F) {
Builder.setFastMathFlags(FMF);
}
// Functions that wrap IRBuilder calls get the fast math flags from IRBuilder, as set by this wrapper class
Value *createFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr) {
return Builder.CreateFDiv(L, R, Name, FPMD);
}
// Functions that wrap BinaryOperator static calls need the FMF we have stored
// The "Inst" suffix indicate that we want to return an instruction
BinaryOperator *CreateFDivInst(Value *V1, Value *V2, const Twine &Name = "") {
return BinaryOperator::CreateFDivFMF(V1, V2, FMF, Name);
}
private:
IRBuilder &Builder;
FastMathFlagGuard FMFGuard;
FastMathFlags FMF;
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
✅ With the latest revision this PR passed the C/C++ code formatter. |
static BinaryOperator * | ||
CreateWithFMF(BinaryOps Opc, Value *V1, Value *V2, FastMathFlags FMF, | ||
const Twine &Name = "", Instruction *InsertBefore = nullptr) { | ||
BinaryOperator *BO = Create(Opc, V1, V2, Name, InsertBefore); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you added a new operand/overload to Create, can you avoid the new include?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think so. The include is needed because I'm passing FastMathFlags by value. It gets passed as a single integer, so it doesn't really make sense to pass by pointer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It kind of would be nice if we could get a matcher that checked the FMF flags of everything in the chain of instructions, but I don't think the matcher infrastructure is set up in a way that makes such a thing possible.
But overall, this is something desperately needed in more of our fast-math optimizations.
} | ||
if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) { | ||
// FIXME: This seems like it should also be checking for arcp |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suppose the question here is if the transformation looks like (A / B) * C => (A * recip(B)) * C => A * (recip(B) * C) => A * (C / B)
(with the creation/deletion of the internal recip
operation requiring arcp
), or if it's just a straightforward application of the associativity law. We're not entirely consistent with regards to whether or not reassociation of division requires both reassoc
and arcp
, although requiring arcp
does seem to be the plurality opinion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My thinking is that division is not associative, and you can only apply the associative property if you replace division with multiplication by the reciprocal. I agree that we're not consistent about this, but I don't see how skipping the arcp check can be justified.
FWIW, gcc thinks this requires reciprocal math: https://godbolt.org/z/T7xTTshbf
I thought about that, but I don't think we necessarily want to check everything in the chain. Consider this expression:
This is used for the So, maybe something like this?
Then if something being matched is an FPMathOperator we check fast math flags, but if it isn't explicitly so in the match expression we don't. Is that what you had in mind? |
This change updates a few of the transformations in foldFMulReassoc to
respect absent fast-math flags in a few cases where fmul and fdiv
instructions were being folded but the code was not checking for
fast-math flags on the fdiv instruction and was transferring flags to
the folded instruction that were not present on the original fdiv
instruction.
This fixes #82857