Skip to content

Commit 83b01aa

Browse files
hazzlim and david-arm authored
[LoopIdiom] Support 'shift until less-than' idiom (#95002)
The current loop idiom code for recognising and inserting a CTLZ intrinsic does not support loops where the loopback control is based on an unsigned less-than condition. This patch adds support for recognising these loops and inserting a CTLZ intrinsic. Fixes the missed optimization cases in #51064 --------- Co-authored-by: David Sherwood <[email protected]>
1 parent 4c47b41 commit 83b01aa

File tree

2 files changed

+1033
-36
lines changed

2 files changed

+1033
-36
lines changed

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Lines changed: 255 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -231,12 +231,19 @@ class LoopIdiomRecognize {
231231
bool recognizePopcount();
232232
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
233233
PHINode *CntPhi, Value *Var);
234+
bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
235+
bool ZeroCheck, size_t CanonicalSize);
236+
bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
237+
Instruction *DefX, PHINode *CntPhi,
238+
Instruction *CntInst);
234239
bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
240+
bool recognizeShiftUntilLessThan();
235241
void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
236242
Instruction *CntInst, PHINode *CntPhi,
237243
Value *Var, Instruction *DefX,
238244
const DebugLoc &DL, bool ZeroCheck,
239-
bool IsCntPhiUsedOutsideLoop);
245+
bool IsCntPhiUsedOutsideLoop,
246+
bool InsertSub = false);
240247

241248
bool recognizeShiftUntilBitTest();
242249
bool recognizeShiftUntilZero();
@@ -1482,7 +1489,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
14821489
<< CurLoop->getHeader()->getName() << "\n");
14831490

14841491
return recognizePopcount() || recognizeAndInsertFFS() ||
1485-
recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
1492+
recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
1493+
recognizeShiftUntilLessThan();
14861494
}
14871495

14881496
/// Check if the given conditional branch is based on the comparison between
@@ -1517,6 +1525,34 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
15171525
return nullptr;
15181526
}
15191527

1528+
/// Check if the given conditional branch is based on an unsigned less-than
1529+
/// comparison between a variable and a constant, and if the comparison is false
1530+
/// the control yields to the loop entry. If the branch matches the behaviour,
1531+
/// the variable involved in the comparison is returned.
1532+
static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
1533+
uint64_t &Threshold) {
1534+
if (!BI || !BI->isConditional())
1535+
return nullptr;
1536+
1537+
ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
1538+
if (!Cond)
1539+
return nullptr;
1540+
1541+
ConstantInt *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
1542+
if (!CmpConst)
1543+
return nullptr;
1544+
1545+
BasicBlock *FalseSucc = BI->getSuccessor(1);
1546+
ICmpInst::Predicate Pred = Cond->getPredicate();
1547+
1548+
if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
1549+
Threshold = CmpConst->getZExtValue();
1550+
return Cond->getOperand(0);
1551+
}
1552+
1553+
return nullptr;
1554+
}
1555+
15201556
// Check if the recurrence variable `VarX` is in the right form to create
15211557
// the idiom. Returns the value coerced to a PHINode if so.
15221558
static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
@@ -1528,6 +1564,107 @@ static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
15281564
return nullptr;
15291565
}
15301566

1567+
/// Return true if the idiom is detected in the loop.
1568+
///
1569+
/// Additionally:
1570+
/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1571+
/// or nullptr if there is no such.
1572+
/// 2) \p CntPhi is set to the corresponding phi node
1573+
/// or nullptr if there is no such.
1574+
/// 3) \p InitX is set to the value whose CTLZ could be used.
1575+
/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1576+
/// 5) \p Threshold is set to the constant involved in the unsigned less-than
1577+
/// comparison.
1578+
///
1579+
/// The core idiom we are trying to detect is:
1580+
/// \code
1581+
/// if (x0 < 2)
1582+
/// goto loop-exit // the precondition of the loop
1583+
/// cnt0 = init-val
1584+
/// do {
1585+
/// x = phi (x0, x.next); //PhiX
1586+
/// cnt = phi (cnt0, cnt.next)
1587+
///
1588+
/// cnt.next = cnt + 1;
1589+
/// ...
1590+
/// x.next = x >> 1; // DefX
1591+
/// } while (x >= 4)
1592+
/// loop-exit:
1593+
/// \endcode
1594+
static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
1595+
Intrinsic::ID &IntrinID,
1596+
Value *&InitX, Instruction *&CntInst,
1597+
PHINode *&CntPhi, Instruction *&DefX,
1598+
uint64_t &Threshold) {
1599+
BasicBlock *LoopEntry;
1600+
1601+
DefX = nullptr;
1602+
CntInst = nullptr;
1603+
CntPhi = nullptr;
1604+
LoopEntry = *(CurLoop->block_begin());
1605+
1606+
// step 1: Check if the loop-back branch is in desirable form.
1607+
if (Value *T = matchShiftULTCondition(
1608+
dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
1609+
Threshold))
1610+
DefX = dyn_cast<Instruction>(T);
1611+
else
1612+
return false;
1613+
1614+
// step 2: Check the recurrence of variable X
1615+
if (!DefX || !isa<PHINode>(DefX))
1616+
return false;
1617+
1618+
PHINode *VarPhi = cast<PHINode>(DefX);
1619+
int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
1620+
if (Idx == -1)
1621+
return false;
1622+
1623+
DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
1624+
if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
1625+
return false;
1626+
1627+
// step 3: detect instructions corresponding to "x.next = x >> 1"
1628+
if (DefX->getOpcode() != Instruction::LShr)
1629+
return false;
1630+
1631+
IntrinID = Intrinsic::ctlz;
1632+
ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
1633+
if (!Shft || !Shft->isOne())
1634+
return false;
1635+
1636+
InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());
1637+
1638+
// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1639+
// or cnt.next = cnt + -1.
1640+
// TODO: We can skip the step. If loop trip count is known (CTLZ),
1641+
// then all uses of "cnt.next" could be optimized to the trip count
1642+
// plus "cnt0". Currently it is not optimized.
1643+
// This step could be used to detect POPCNT instruction:
1644+
// cnt.next = cnt + (x.next & 1)
1645+
for (Instruction &Inst : llvm::make_range(
1646+
LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
1647+
if (Inst.getOpcode() != Instruction::Add)
1648+
continue;
1649+
1650+
ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
1651+
if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
1652+
continue;
1653+
1654+
PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
1655+
if (!Phi)
1656+
continue;
1657+
1658+
CntInst = &Inst;
1659+
CntPhi = Phi;
1660+
break;
1661+
}
1662+
if (!CntInst)
1663+
return false;
1664+
1665+
return true;
1666+
}
1667+
15311668
/// Return true iff the idiom is detected in the loop.
15321669
///
15331670
/// Additionally:
@@ -1756,27 +1893,35 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
17561893
return true;
17571894
}
17581895

1759-
/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1760-
/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1761-
/// trip count returns true; otherwise, returns false.
1762-
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1763-
// Give up if the loop has multiple blocks or multiple backedges.
1764-
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1765-
return false;
1896+
// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1897+
// profitable if we delete the loop.
1898+
bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
1899+
Value *InitX, bool ZeroCheck,
1900+
size_t CanonicalSize) {
1901+
const Value *Args[] = {InitX,
1902+
ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
17661903

1767-
Intrinsic::ID IntrinID;
1768-
Value *InitX;
1769-
Instruction *DefX = nullptr;
1770-
PHINode *CntPhi = nullptr;
1771-
Instruction *CntInst = nullptr;
1772-
// Help decide if transformation is profitable. For ShiftUntilZero idiom,
1773-
// this is always 6.
1774-
size_t IdiomCanonicalSize = 6;
1904+
// @llvm.dbg doesn't count as they have no semantic effect.
1905+
auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1906+
uint32_t HeaderSize =
1907+
std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
17751908

1776-
if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
1777-
CntInst, CntPhi, DefX))
1909+
IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1910+
InstructionCost Cost = TTI->getIntrinsicInstrCost(
1911+
Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1912+
if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
17781913
return false;
17791914

1915+
return true;
1916+
}
1917+
1918+
/// Convert CTLZ / CTTZ idiom loop into countable loop.
1919+
/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
1920+
/// returns false.
1921+
bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
1922+
Value *InitX, Instruction *DefX,
1923+
PHINode *CntPhi,
1924+
Instruction *CntInst) {
17801925
bool IsCntPhiUsedOutsideLoop = false;
17811926
for (User *U : CntPhi->users())
17821927
if (!CurLoop->contains(cast<Instruction>(U))) {
@@ -1818,35 +1963,107 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() {
18181963
ZeroCheck = true;
18191964
}
18201965

1821-
// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
1822-
// profitable if we delete the loop.
1823-
1824-
// the loop has only 6 instructions:
1966+
// FFS idiom loop has only 6 instructions:
18251967
// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
18261968
// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
18271969
// %shr = ashr %n.addr.0, 1
18281970
// %tobool = icmp eq %shr, 0
18291971
// %inc = add nsw %i.0, 1
18301972
// br i1 %tobool
1973+
size_t IdiomCanonicalSize = 6;
1974+
if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
1975+
return false;
18311976

1832-
const Value *Args[] = {InitX,
1833-
ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
1977+
transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
1978+
DefX->getDebugLoc(), ZeroCheck,
1979+
IsCntPhiUsedOutsideLoop);
1980+
return true;
1981+
}
18341982

1835-
// @llvm.dbg doesn't count as they have no semantic effect.
1836-
auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
1837-
uint32_t HeaderSize =
1838-
std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
1983+
/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
1984+
/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
1985+
/// trip count returns true; otherwise, returns false.
1986+
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
1987+
// Give up if the loop has multiple blocks or multiple backedges.
1988+
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
1989+
return false;
18391990

1840-
IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
1841-
InstructionCost Cost =
1842-
TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
1843-
if (HeaderSize != IdiomCanonicalSize &&
1844-
Cost > TargetTransformInfo::TCC_Basic)
1991+
Intrinsic::ID IntrinID;
1992+
Value *InitX;
1993+
Instruction *DefX = nullptr;
1994+
PHINode *CntPhi = nullptr;
1995+
Instruction *CntInst = nullptr;
1996+
1997+
if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
1998+
DefX))
1999+
return false;
2000+
2001+
return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2002+
}
2003+
2004+
bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2005+
// Give up if the loop has multiple blocks or multiple backedges.
2006+
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
2007+
return false;
2008+
2009+
Intrinsic::ID IntrinID;
2010+
Value *InitX;
2011+
Instruction *DefX = nullptr;
2012+
PHINode *CntPhi = nullptr;
2013+
Instruction *CntInst = nullptr;
2014+
2015+
uint64_t LoopThreshold;
2016+
if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
2017+
CntPhi, DefX, LoopThreshold))
2018+
return false;
2019+
2020+
if (LoopThreshold == 2) {
2021+
// Treat as regular FFS.
2022+
return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2023+
}
2024+
2025+
// Look for Floor Log2 Idiom.
2026+
if (LoopThreshold != 4)
2027+
return false;
2028+
2029+
// Abort if CntPhi is used outside of the loop.
2030+
for (User *U : CntPhi->users())
2031+
if (!CurLoop->contains(cast<Instruction>(U)))
2032+
return false;
2033+
2034+
// It is safe to assume Preheader exists as it was checked in
2035+
// parent function RunOnLoop.
2036+
BasicBlock *PH = CurLoop->getLoopPreheader();
2037+
auto *PreCondBB = PH->getSinglePredecessor();
2038+
if (!PreCondBB)
2039+
return false;
2040+
auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
2041+
if (!PreCondBI)
2042+
return false;
2043+
2044+
uint64_t PreLoopThreshold;
2045+
if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
2046+
PreLoopThreshold != 2)
18452047
return false;
18462048

2049+
bool ZeroCheck = true;
2050+
2051+
// the loop has only 6 instructions:
2052+
// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2053+
// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2054+
// %shr = lshr %n.addr.0, 1
2055+
// %tobool = icmp ult %n.addr.0, C
2056+
// %inc = add nsw %i.0, 1
2057+
// br i1 %tobool
2058+
size_t IdiomCanonicalSize = 6;
2059+
if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
2060+
return false;
2061+
2062+
// floor(log2(x)) = w - 1 - ctlz(x), where w is the bit width of x
18472063
transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
18482064
DefX->getDebugLoc(), ZeroCheck,
1849-
IsCntPhiUsedOutsideLoop);
2065+
/*IsCntPhiUsedOutsideLoop=*/false,
2066+
/*InsertSub=*/true);
18502067
return true;
18512068
}
18522069

@@ -1961,7 +2178,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
19612178
void LoopIdiomRecognize::transformLoopToCountable(
19622179
Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
19632180
PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
1964-
bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
2181+
bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
19652182
BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
19662183

19672184
// Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
@@ -1991,6 +2208,8 @@ void LoopIdiomRecognize::transformLoopToCountable(
19912208
Type *CountTy = Count->getType();
19922209
Count = Builder.CreateSub(
19932210
ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
2211+
if (InsertSub)
2212+
Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
19942213
Value *NewCount = Count;
19952214
if (IsCntPhiUsedOutsideLoop)
19962215
Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));

0 commit comments

Comments
 (0)