Skip to content

Commit f06d96a

Browse files
committed
[InstCombine] Transform high latency, dependent FSQRT/FDIV into FMUL
The proposed patch, in general, tries to transform the code sequence below:

    x  = 1.0 / sqrt(a);
    r1 = x * x;        // same as 1.0 / a
    r2 = a / sqrt(a);  // same as sqrt(a)

TO (if x, r1 and r2 are all used further in the code):

    tmp1 = 1.0 / a
    tmp2 = sqrt(a)
    tmp3 = tmp1 * tmp2
    x  = tmp3
    r1 = tmp1
    r2 = tmp2

The transform makes the high-latency sqrt and div operations independent of each other and also saves one multiplication. The patch was tested with the SPEC17 suite with cpu=neoverse-v2. The performance uplift achieved was: 544.nab_r ~4%. No regressions were observed, and no compile-time differences were observed with the patch. Closes #54652
1 parent f11c0a1 commit f06d96a

File tree

2 files changed

+778
-0
lines changed

2 files changed

+778
-0
lines changed

llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
#include "InstCombineInternal.h"
1515
#include "llvm/ADT/APInt.h"
16+
#include "llvm/ADT/SmallPtrSet.h"
1617
#include "llvm/ADT/SmallVector.h"
1718
#include "llvm/Analysis/InstructionSimplify.h"
1819
#include "llvm/Analysis/ValueTracking.h"
@@ -666,6 +667,90 @@ Instruction *InstCombinerImpl::foldPowiReassoc(BinaryOperator &I) {
666667
return nullptr;
667668
}
668669

670+
// Check legality for transforming
671+
// x = 1.0/sqrt(a)
672+
// r1 = x * x;
673+
// r2 = a/sqrt(a);
674+
//
675+
// TO
676+
//
677+
// r1 = 1/a
678+
// r2 = sqrt(a)
679+
// x = r1 * r2
680+
// This transform works only when 'a' is known positive.
681+
static bool isFSqrtDivToFMulLegal(Instruction *X,
682+
SmallPtrSetImpl<Instruction *> &R1,
683+
SmallPtrSetImpl<Instruction *> &R2) {
684+
BasicBlock *BBx = X->getParent();
685+
BasicBlock *BBr1 = (*R1.begin())->getParent();
686+
BasicBlock *BBr2 = (*R2.begin())->getParent();
687+
688+
CallInst *FSqrt = cast<CallInst>(X->getOperand(1));
689+
if (!FSqrt->hasAllowReassoc() || !FSqrt->hasNoNaNs() ||
690+
!FSqrt->hasNoSignedZeros() || !FSqrt->hasNoInfs())
691+
return false;
692+
693+
// We change x = 1/sqrt(a) to x = sqrt(a) * 1/a . This change isn't allowed
694+
// by recip fp as it is strictly meant to transform ops of type a/b to
695+
// a * 1/b. So, this can be considered as algebraic rewrite and reassoc flag
696+
// has been used(rather abused)in the past for algebraic rewrites.
697+
if (!X->hasAllowReassoc() || !X->hasAllowReciprocal() || !X->hasNoInfs())
698+
return false;
699+
700+
// Check the constraints on X, R1 and R2 combined.
701+
// fdiv instruction and one of the multiplications must reside in the same
702+
// block. If not, the optimized code may execute more ops than before and
703+
// this may hamper the performance.
704+
if (BBx != BBr1 && BBx != BBr2)
705+
return false;
706+
707+
// Check the constraints on instructions in R1.
708+
if (any_of(R1, [BBr1](Instruction *I) {
709+
// When you have multiple instructions residing in R1 and R2
710+
// respectively, it's difficult to generate combinations of (R1,R2) and
711+
// then check if we have the required pattern. So, for now, just be
712+
// conservative.
713+
return (I->getParent() != BBr1 || !I->hasAllowReassoc());
714+
}))
715+
return false;
716+
717+
// Check the constraints on instructions in R2.
718+
return all_of(R2, [BBr2](Instruction *I) {
719+
// When you have multiple instructions residing in R1 and R2
720+
// respectively, it's difficult to generate combination of (R1,R2) and
721+
// then check if we have the required pattern. So, for now, just be
722+
// conservative.
723+
return (I->getParent() == BBr2 && I->hasAllowReassoc());
724+
});
725+
}
726+
727+
// If we have the following pattern,
728+
// X = 1.0/sqrt(a)
729+
// R1 = X * X
730+
// R2 = a/sqrt(a)
731+
// then this method collects all the instructions that match R1 and R2.
732+
static bool getFSqrtDivOptPattern(Instruction *Div,
733+
SmallPtrSetImpl<Instruction *> &R1,
734+
SmallPtrSetImpl<Instruction *> &R2) {
735+
Value *A;
736+
if (match(Div, m_FDiv(m_FPOne(), m_Sqrt(m_Value(A)))) ||
737+
match(Div, m_FDiv(m_SpecificFP(-1.0), m_Sqrt(m_Value(A))))) {
738+
for (User *U : Div->users()) {
739+
Instruction *I = cast<Instruction>(U);
740+
if (match(I, m_FMul(m_Specific(Div), m_Specific(Div))))
741+
R1.insert(I);
742+
}
743+
744+
CallInst *CI = cast<CallInst>(Div->getOperand(1));
745+
for (User *U : CI->users()) {
746+
Instruction *I = cast<Instruction>(U);
747+
if (match(I, m_FDiv(m_Specific(A), m_Sqrt(m_Specific(A)))))
748+
R2.insert(I);
749+
}
750+
}
751+
return !R1.empty() && !R2.empty();
752+
}
753+
669754
Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) {
670755
Value *Op0 = I.getOperand(0);
671756
Value *Op1 = I.getOperand(1);
@@ -1864,6 +1949,68 @@ static Instruction *foldFDivSqrtDivisor(BinaryOperator &I,
18641949
return BinaryOperator::CreateFMulFMF(Op0, NewSqrt, &I);
18651950
}
18661951

1952+
// Change
1953+
// X = 1/sqrt(a)
1954+
// R1 = X * X
1955+
// R2 = a * X
1956+
//
1957+
// TO
1958+
//
1959+
// FDiv = 1/a
1960+
// FSqrt = sqrt(a)
1961+
// FMul = FDiv * FSqrt
1962+
// Replace Uses Of R1 With FDiv
1963+
// Replace Uses Of R2 With FSqrt
1964+
// Replace Uses Of X With FMul
1965+
static Instruction *
1966+
convertFSqrtDivIntoFMul(CallInst *CI, Instruction *X,
1967+
const SmallPtrSetImpl<Instruction *> &R1,
1968+
const SmallPtrSetImpl<Instruction *> &R2,
1969+
InstCombiner::BuilderTy &B, InstCombinerImpl *IC) {
1970+
1971+
B.SetInsertPoint(X);
1972+
1973+
// Every instance of R1 may have different fpmath metadata and fpmath flags.
1974+
// We try to preserve them by having separate fdiv instruction per R1
1975+
// instance.
1976+
Value *SqrtOp = CI->getArgOperand(0);
1977+
Instruction *FDiv;
1978+
1979+
for (Instruction *I : R1) {
1980+
FDiv = cast<Instruction>(
1981+
B.CreateFDiv(ConstantFP::get(X->getType(), 1.0), SqrtOp));
1982+
FDiv->copyMetadata(*I);
1983+
FDiv->copyFastMathFlags(I);
1984+
IC->replaceInstUsesWith(*I, FDiv);
1985+
IC->eraseInstFromFunction(*I);
1986+
}
1987+
1988+
// Although, by value, FSqrt = CI , every instance of R2 may have different
1989+
// fpmath metadata and fpmath flags. We try to preserve them by cloning the
1990+
// call instruction per R2 instance.
1991+
CallInst *FSqrt;
1992+
for (Instruction *I : R2) {
1993+
FSqrt = cast<CallInst>(CI->clone());
1994+
FSqrt->insertBefore(CI);
1995+
FSqrt->copyFastMathFlags(I);
1996+
FSqrt->copyMetadata(*I);
1997+
IC->replaceInstUsesWith(*I, FSqrt);
1998+
IC->eraseInstFromFunction(*I);
1999+
}
2000+
2001+
Instruction *FMul;
2002+
// If X = -1/sqrt(a) initially,then FMul = -(FDiv * FSqrt)
2003+
if (match(X, m_FDiv(m_SpecificFP(-1.0), m_Specific(CI)))) {
2004+
Value *Mul = B.CreateFMul(FDiv, FSqrt);
2005+
FMul = cast<Instruction>(B.CreateFNegFMF(Mul, X));
2006+
} else
2007+
FMul = cast<Instruction>(B.CreateFMulFMF(FDiv, FSqrt, X));
2008+
FMul->copyMetadata(*X);
2009+
2010+
IC->replaceInstUsesWith(*X, FMul);
2011+
return IC->eraseInstFromFunction(*X);
2012+
}
2013+
18672014
Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
18682015
Module *M = I.getModule();
18692016

@@ -1888,6 +2035,24 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
18882035
return R;
18892036

18902037
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
2038+
2039+
// Convert
2040+
// x = 1.0/sqrt(a)
2041+
// r1 = x * x;
2042+
// r2 = a/sqrt(a);
2043+
//
2044+
// TO
2045+
//
2046+
// r1 = 1/a
2047+
// r2 = sqrt(a)
2048+
// x = r1 * r2
2049+
SmallPtrSet<Instruction *, 2> R1, R2;
2050+
if (getFSqrtDivOptPattern(&I, R1, R2) && isFSqrtDivToFMulLegal(&I, R1, R2)) {
2051+
CallInst *CI = cast<CallInst>(I.getOperand(1));
2052+
if (Instruction *D = convertFSqrtDivIntoFMul(CI, &I, R1, R2, Builder, this))
2053+
return D;
2054+
}
2055+
18912056
if (isa<Constant>(Op0))
18922057
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
18932058
if (Instruction *R = FoldOpIntoSelect(I, SI))

0 commit comments

Comments
 (0)