Skip to content
This repository was archived by the owner on Sep 2, 2018. It is now read-only.

Commit 9b12d6a

Browse files
author
James Molloy
committed
[Thumb-1] Synthesize TBB/TBH instructions to make use of compressed jump tables
[Reapplying r284580 and r285917 with fix and testing to ensure emitted jump tables for Thumb-1 have 4-byte alignment]

The TBB and TBH instructions in Thumb-2 allow jump tables to be compressed into sequences of bytes or shorts respectively. These instructions do not exist in Thumb-1, however it is possible to synthesize them out of a sequence of other instructions. It turns out this sequence is so short that it's almost never a loss for performance and is ALWAYS a significant win for code size.

TBB example:

    Before:                      After:
      lsls r0, r0, #2              add  r0, pc
      adr  r1, .LJTI0_0            ldrb r0, [r0, #6]
      ldr  r0, [r0, r1]            lsls r0, r0, #1
      mov  pc, r0                  add  pc, r0

=> No change in prologue code size or dynamic instruction count. Jump table shrunk by a factor of 4.

The only case that can increase dynamic instruction count is the TBH case:

    Before:                      After:
      lsls r0, r4, #2              lsls r4, r4, #1
      adr  r1, .LJTI0_0            add  r4, pc
      ldr  r0, [r0, r1]            ldrh r4, [r4, #6]
      mov  pc, r0                  lsls r4, r4, #1
                                   add  pc, r4

=> 1 more instruction in prologue. Jump table shrunk by a factor of 2.

So there is an argument that this should be disabled when optimizing for performance (and a TBH needs to be generated). I'm not so sure about that in practice, because on small cores with Thumb-1 performance is often tied to code size. But I'm willing to turn it off when optimizing for performance if people want (also note that TBHs are fairly rare in practice!)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@285690 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent a66e032 commit 9b12d6a

9 files changed

+314
-38
lines changed

lib/Target/ARM/ARMAsmPrinter.cpp

+80
Original file line numberDiff line numberDiff line change
@@ -1166,6 +1166,9 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
11661166
const MachineOperand &MO1 = MI->getOperand(1);
11671167
unsigned JTI = MO1.getIndex();
11681168

1169+
if (Subtarget->isThumb1Only())
1170+
EmitAlignment(2);
1171+
11691172
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
11701173
OutStreamer->EmitLabel(JTISymbol);
11711174

@@ -1712,6 +1715,83 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
17121715
.addReg(0));
17131716
return;
17141717
}
1718+
case ARM::tTBB_JT:
1719+
case ARM::tTBH_JT: {
1720+
1721+
bool Is8Bit = MI->getOpcode() == ARM::tTBB_JT;
1722+
unsigned Base = MI->getOperand(0).getReg();
1723+
unsigned Idx = MI->getOperand(1).getReg();
1724+
assert(MI->getOperand(1).isKill() && "We need the index register as scratch!");
1725+
1726+
// Multiply up idx if necessary.
1727+
if (!Is8Bit)
1728+
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri)
1729+
.addReg(Idx)
1730+
.addReg(ARM::CPSR)
1731+
.addReg(Idx)
1732+
.addImm(1)
1733+
// Add predicate operands.
1734+
.addImm(ARMCC::AL)
1735+
.addReg(0));
1736+
1737+
if (Base == ARM::PC) {
1738+
// TBB [base, idx] =
1739+
// ADDS idx, idx, base
1740+
// LDRB idx, [idx, #4] ; or LDRH if TBH
1741+
// LSLS idx, #1
1742+
// ADDS pc, pc, idx
1743+
1744+
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
1745+
.addReg(Idx)
1746+
.addReg(Idx)
1747+
.addReg(Base)
1748+
// Add predicate operands.
1749+
.addImm(ARMCC::AL)
1750+
.addReg(0));
1751+
1752+
unsigned Opc = Is8Bit ? ARM::tLDRBi : ARM::tLDRHi;
1753+
EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
1754+
.addReg(Idx)
1755+
.addReg(Idx)
1756+
.addImm(Is8Bit ? 4 : 2)
1757+
// Add predicate operands.
1758+
.addImm(ARMCC::AL)
1759+
.addReg(0));
1760+
} else {
1761+
// TBB [base, idx] =
1762+
// LDRB idx, [base, idx] ; or LDRH if TBH
1763+
// LSLS idx, #1
1764+
// ADDS pc, pc, idx
1765+
1766+
unsigned Opc = Is8Bit ? ARM::tLDRBr : ARM::tLDRHr;
1767+
EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
1768+
.addReg(Idx)
1769+
.addReg(Base)
1770+
.addReg(Idx)
1771+
// Add predicate operands.
1772+
.addImm(ARMCC::AL)
1773+
.addReg(0));
1774+
}
1775+
1776+
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tLSLri)
1777+
.addReg(Idx)
1778+
.addReg(ARM::CPSR)
1779+
.addReg(Idx)
1780+
.addImm(1)
1781+
// Add predicate operands.
1782+
.addImm(ARMCC::AL)
1783+
.addReg(0));
1784+
1785+
OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
1786+
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
1787+
.addReg(ARM::PC)
1788+
.addReg(ARM::PC)
1789+
.addReg(Idx)
1790+
// Add predicate operands.
1791+
.addImm(ARMCC::AL)
1792+
.addReg(0));
1793+
return;
1794+
}
17151795
case ARM::tBR_JTr:
17161796
case ARM::BR_JTr: {
17171797
// Lower and emit the instruction itself, then the jump table following it.

lib/Target/ARM/ARMConstantIslandPass.cpp

+93-15
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ static cl::opt<unsigned>
5858
CPMaxIteration("arm-constant-island-max-iteration", cl::Hidden, cl::init(30),
5959
cl::desc("The max number of iteration for converge"));
6060

61+
static cl::opt<bool> SynthesizeThumb1TBB(
62+
"arm-synthesize-thumb-1-tbb", cl::Hidden, cl::init(true),
63+
cl::desc("Use compressed jump tables in Thumb-1 by synthesizing an "
64+
"equivalent to the TBB/TBH instructions"));
65+
6166
namespace {
6267
/// ARMConstantIslands - Due to limited PC-relative displacements, ARM
6368
/// requires constant pool entries to be scattered among the instructions
@@ -189,6 +194,7 @@ namespace {
189194
bool isThumb;
190195
bool isThumb1;
191196
bool isThumb2;
197+
bool isPositionIndependentOrROPI;
192198
public:
193199
static char ID;
194200
ARMConstantIslands() : MachineFunctionPass(ID) {}
@@ -319,13 +325,16 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
319325

320326
STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget());
321327
TII = STI->getInstrInfo();
328+
isPositionIndependentOrROPI =
329+
STI->getTargetLowering()->isPositionIndependent() || STI->isROPI();
322330
AFI = MF->getInfo<ARMFunctionInfo>();
323331

324332
isThumb = AFI->isThumbFunction();
325333
isThumb1 = AFI->isThumb1OnlyFunction();
326334
isThumb2 = AFI->isThumb2Function();
327335

328336
HasFarJump = false;
337+
bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB);
329338

330339
// This pass invalidates liveness information when it splits basic blocks.
331340
MF->getRegInfo().invalidateLiveness();
@@ -337,7 +346,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
337346
// Try to reorder and otherwise adjust the block layout to make good use
338347
// of the TB[BH] instructions.
339348
bool MadeChange = false;
340-
if (isThumb2 && AdjustJumpTableBlocks) {
349+
if (GenerateTBB && AdjustJumpTableBlocks) {
341350
scanFunctionJumpTables();
342351
MadeChange |= reorderThumb2JumpTables();
343352
// Data is out of date, so clear it. It'll be re-computed later.
@@ -414,7 +423,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
414423
MadeChange |= optimizeThumb2Branches();
415424

416425
// Optimize jump tables using TBB / TBH.
417-
if (isThumb2)
426+
if (GenerateTBB)
418427
MadeChange |= optimizeThumb2JumpTables();
419428

420429
// After a while, this might be made debug-only, but it is not expensive.
@@ -540,9 +549,11 @@ void ARMConstantIslands::doInitialJumpTablePlacement(
540549
case ARM::t2BR_JT:
541550
JTOpcode = ARM::JUMPTABLE_INSTS;
542551
break;
552+
case ARM::tTBB_JT:
543553
case ARM::t2TBB_JT:
544554
JTOpcode = ARM::JUMPTABLE_TBB;
545555
break;
556+
case ARM::tTBH_JT:
546557
case ARM::t2TBH_JT:
547558
JTOpcode = ARM::JUMPTABLE_TBH;
548559
break;
@@ -615,8 +626,9 @@ unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
615626
case ARM::CONSTPOOL_ENTRY:
616627
break;
617628
case ARM::JUMPTABLE_TBB:
618-
return 0;
629+
return isThumb1 ? 2 : 0;
619630
case ARM::JUMPTABLE_TBH:
631+
return isThumb1 ? 2 : 1;
620632
case ARM::JUMPTABLE_INSTS:
621633
return 1;
622634
case ARM::JUMPTABLE_ADDRS:
@@ -638,7 +650,8 @@ unsigned ARMConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
638650
void ARMConstantIslands::scanFunctionJumpTables() {
639651
for (MachineBasicBlock &MBB : *MF) {
640652
for (MachineInstr &I : MBB)
641-
if (I.isBranch() && I.getOpcode() == ARM::t2BR_JT)
653+
if (I.isBranch() &&
654+
(I.getOpcode() == ARM::t2BR_JT || I.getOpcode() == ARM::tBR_JTr))
642655
T2JumpTables.push_back(&I);
643656
}
644657
}
@@ -679,6 +692,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
679692
default:
680693
continue; // Ignore other JT branches
681694
case ARM::t2BR_JT:
695+
case ARM::tBR_JTr:
682696
T2JumpTables.push_back(&I);
683697
continue; // Does not get an entry in ImmBranches
684698
case ARM::Bcc:
@@ -1943,7 +1957,7 @@ bool ARMConstantIslands::preserveBaseRegister(MachineInstr *JumpMI,
19431957

19441958
if (RemovableAdd) {
19451959
RemovableAdd->eraseFromParent();
1946-
DeadSize += 4;
1960+
DeadSize += isThumb2 ? 4 : 2;
19471961
} else if (BaseReg == EntryReg) {
19481962
// The add wasn't removable, but clobbered the base for the TBB. So we can't
19491963
// preserve it.
@@ -2010,25 +2024,82 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
20102024
if (!ByteOk && !HalfWordOk)
20112025
continue;
20122026

2027+
CPUser &User = CPUsers[JumpTableUserIndices[JTI]];
20132028
MachineBasicBlock *MBB = MI->getParent();
20142029
if (!MI->getOperand(0).isKill()) // FIXME: needed now?
20152030
continue;
2016-
unsigned IdxReg = MI->getOperand(1).getReg();
2017-
bool IdxRegKill = MI->getOperand(1).isKill();
20182031

2019-
CPUser &User = CPUsers[JumpTableUserIndices[JTI]];
20202032
unsigned DeadSize = 0;
20212033
bool CanDeleteLEA = false;
20222034
bool BaseRegKill = false;
2023-
bool PreservedBaseReg =
2035+
2036+
unsigned IdxReg = ~0U;
2037+
bool IdxRegKill = true;
2038+
if (isThumb2) {
2039+
IdxReg = MI->getOperand(1).getReg();
2040+
IdxRegKill = MI->getOperand(1).isKill();
2041+
2042+
bool PreservedBaseReg =
20242043
preserveBaseRegister(MI, User.MI, DeadSize, CanDeleteLEA, BaseRegKill);
2044+
if (!jumpTableFollowsTB(MI, User.CPEMI) && !PreservedBaseReg)
2045+
continue;
2046+
} else {
2047+
// We're in thumb-1 mode, so we must have something like:
2048+
// %idx = tLSLri %idx, 2
2049+
// %base = tLEApcrelJT
2050+
// %t = tLDRr %idx, %base
2051+
unsigned BaseReg = User.MI->getOperand(0).getReg();
2052+
2053+
if (User.MI->getIterator() == User.MI->getParent()->begin())
2054+
continue;
2055+
MachineInstr *Shift = User.MI->getPrevNode();
2056+
if (Shift->getOpcode() != ARM::tLSLri ||
2057+
Shift->getOperand(3).getImm() != 2 ||
2058+
!Shift->getOperand(2).isKill())
2059+
continue;
2060+
IdxReg = Shift->getOperand(2).getReg();
2061+
unsigned ShiftedIdxReg = Shift->getOperand(0).getReg();
20252062

2026-
if (!jumpTableFollowsTB(MI, User.CPEMI) && !PreservedBaseReg)
2027-
continue;
2063+
MachineInstr *Load = User.MI->getNextNode();
2064+
if (Load->getOpcode() != ARM::tLDRr)
2065+
continue;
2066+
if (Load->getOperand(1).getReg() != ShiftedIdxReg ||
2067+
Load->getOperand(2).getReg() != BaseReg ||
2068+
!Load->getOperand(1).isKill())
2069+
continue;
20282070

2071+
// If we're in PIC mode, there should be another ADD following.
2072+
if (isPositionIndependentOrROPI) {
2073+
MachineInstr *Add = Load->getNextNode();
2074+
if (Add->getOpcode() != ARM::tADDrr ||
2075+
Add->getOperand(2).getReg() != Load->getOperand(0).getReg() ||
2076+
Add->getOperand(3).getReg() != BaseReg ||
2077+
!Add->getOperand(2).isKill())
2078+
continue;
2079+
if (Add->getOperand(0).getReg() != MI->getOperand(0).getReg())
2080+
continue;
2081+
2082+
Add->eraseFromParent();
2083+
DeadSize += 2;
2084+
} else {
2085+
if (Load->getOperand(0).getReg() != MI->getOperand(0).getReg())
2086+
continue;
2087+
}
2088+
2089+
2090+
// Now safe to delete the load and lsl. The LEA will be removed later.
2091+
CanDeleteLEA = true;
2092+
Shift->eraseFromParent();
2093+
Load->eraseFromParent();
2094+
DeadSize += 4;
2095+
}
2096+
20292097
DEBUG(dbgs() << "Shrink JT: " << *MI);
20302098
MachineInstr *CPEMI = User.CPEMI;
20312099
unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
2100+
if (!isThumb2)
2101+
Opc = ByteOk ? ARM::tTBB_JT : ARM::tTBH_JT;
2102+
20322103
MachineBasicBlock::iterator MI_JT = MI;
20332104
MachineInstr *NewJTMI =
20342105
BuildMI(*MBB, MI_JT, MI->getDebugLoc(), TII->get(Opc))
@@ -2048,7 +2119,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
20482119

20492120
if (CanDeleteLEA) {
20502121
User.MI->eraseFromParent();
2051-
DeadSize += 4;
2122+
DeadSize += isThumb2 ? 4 : 2;
20522123

20532124
// The LEA was eliminated, the TBB instruction becomes the only new user
20542125
// of the jump table.
@@ -2164,9 +2235,16 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
21642235
// Add an unconditional branch from NewBB to BB.
21652236
// There doesn't seem to be meaningful DebugInfo available; this doesn't
21662237
// correspond directly to anything in the source.
2167-
assert (isThumb2 && "Adjusting for TB[BH] but not in Thumb2?");
2168-
BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B)).addMBB(BB)
2169-
.addImm(ARMCC::AL).addReg(0);
2238+
if (isThumb2)
2239+
BuildMI(NewBB, DebugLoc(), TII->get(ARM::t2B))
2240+
.addMBB(BB)
2241+
.addImm(ARMCC::AL)
2242+
.addReg(0);
2243+
else
2244+
BuildMI(NewBB, DebugLoc(), TII->get(ARM::tB))
2245+
.addMBB(BB)
2246+
.addImm(ARMCC::AL)
2247+
.addReg(0);
21702248

21712249
// Update internal data structures to account for the newly inserted MBB.
21722250
MF->RenumberBlocks(NewBB);

lib/Target/ARM/ARMInstrThumb.td

+12
Original file line numberDiff line numberDiff line change
@@ -1315,6 +1315,18 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
13151315
(ins i32imm:$label, pred:$p),
13161316
2, IIC_iALUi, []>, Sched<[WriteALU]>;
13171317

1318+
// Thumb-1 doesn't have the TBB or TBH instructions, but we can synthesize them
1319+
// and make use of the same compressed jump table format as Thumb-2.
1320+
let Size = 2 in {
1321+
def tTBB_JT : tPseudoInst<(outs),
1322+
(ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
1323+
Sched<[WriteBr]>;
1324+
1325+
def tTBH_JT : tPseudoInst<(outs),
1326+
(ins tGPR:$base, tGPR:$index, i32imm:$jt, i32imm:$pclbl), 0, IIC_Br, []>,
1327+
Sched<[WriteBr]>;
1328+
}
1329+
13181330
//===----------------------------------------------------------------------===//
13191331
// TLS Instructions
13201332
//

test/CodeGen/ARM/arm-position-independence-jump-table.ll

+14-21
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
; RUN: llc -relocation-model=ropi -mtriple=thumbv7m--none-eabi -disable-block-placement < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB2
99
; RUN: llc -relocation-model=ropi-rwpi -mtriple=thumbv7m--none-eabi -disable-block-placement < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB2
1010

11-
; RUN: llc -relocation-model=static -mtriple=thumbv6m--none-eabi -disable-block-placement < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1 --check-prefix=THUMB1_ABS
12-
; RUN: llc -relocation-model=ropi -mtriple=thumbv6m--none-eabi -disable-block-placement < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1 --check-prefix=THUMB1_PC
13-
; RUN: llc -relocation-model=ropi-rwpi -mtriple=thumbv6m--none-eabi -disable-block-placement < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1 --check-prefix=THUMB1_PC
11+
; RUN: llc -relocation-model=static -mtriple=thumbv6m--none-eabi -disable-block-placement < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1
12+
; RUN: llc -relocation-model=ropi -mtriple=thumbv6m--none-eabi -disable-block-placement < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1
13+
; RUN: llc -relocation-model=ropi-rwpi -mtriple=thumbv6m--none-eabi -disable-block-placement < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1
1414

1515

1616
declare void @exit0()
@@ -85,30 +85,23 @@ lab4:
8585
; THUMB2: [[LBB4]]
8686
; THUMB2-NEXT: b exit4
8787

88-
; THUMB1: lsls r[[R_TAB_INDEX:[0-9]+]], r{{[0-9]+}}, #2
89-
; THUMB1: adr r[[R_TAB_BASE:[0-9]+]], [[LJTI:\.LJTI[0-9]+_[0-9]+]]
90-
; THUMB1: ldr r[[R_BB_ADDR:[0-9]+]], [r[[R_TAB_INDEX]], r[[R_TAB_BASE]]]
91-
; THUMB1_PC: adds r[[R_BB_ADDR]], r[[R_BB_ADDR]], r[[R_TAB_BASE]]
92-
; THUMB1: mov pc, r[[R_BB_ADDR]]
93-
; THUMB1: [[LJTI]]
94-
; THUMB1_ABS: .long [[LBB1:\.LBB[0-9]+_[0-9]+]]+1
95-
; THUMB1_ABS: .long [[LBB2:\.LBB[0-9]+_[0-9]+]]+1
96-
; THUMB1_ABS: .long [[LBB3:\.LBB[0-9]+_[0-9]+]]+1
97-
; THUMB1_ABS: .long [[LBB4:\.LBB[0-9]+_[0-9]+]]+1
98-
; THUMB1_PC: .long [[LBB1:\.LBB[0-9]+_[0-9]+]]-[[LJTI]]
99-
; THUMB1_PC: .long [[LBB2:\.LBB[0-9]+_[0-9]+]]-[[LJTI]]
100-
; THUMB1_PC: .long [[LBB3:\.LBB[0-9]+_[0-9]+]]-[[LJTI]]
101-
; THUMB1_PC: .long [[LBB4:\.LBB[0-9]+_[0-9]+]]-[[LJTI]]
88+
89+
; THUMB1: add r[[x:[0-9]+]], pc
90+
; THUMB1: ldrb r[[x]], [r[[x]], #4]
91+
; THUMB1: lsls r[[x]], r[[x]], #1
92+
; THUMB1: [[LCPI:\.LCPI[0-9]+_[0-9]+]]:
93+
; THUMB1: add pc, r[[x]]
94+
; THUMB1: .p2align 2
95+
; THUMB1: .byte ([[LBB1:\.LBB[0-9]+_[0-9]+]]-([[LCPI]]+4))/2
96+
; THUMB1: .byte ([[LBB2:\.LBB[0-9]+_[0-9]+]]-([[LCPI]]+4))/2
97+
; THUMB1: .byte ([[LBB3:\.LBB[0-9]+_[0-9]+]]-([[LCPI]]+4))/2
98+
; THUMB1: .byte ([[LBB4:\.LBB[0-9]+_[0-9]+]]-([[LCPI]]+4))/2
10299
; THUMB1: [[LBB1]]
103100
; THUMB1-NEXT: bl exit1
104-
; THUMB1-NEXT: pop
105101
; THUMB1: [[LBB2]]
106102
; THUMB1-NEXT: bl exit2
107-
; THUMB1-NEXT: pop
108103
; THUMB1: [[LBB3]]
109104
; THUMB1-NEXT: bl exit3
110-
; THUMB1-NEXT: pop
111105
; THUMB1: [[LBB4]]
112106
; THUMB1-NEXT: bl exit4
113-
; THUMB1-NEXT: pop
114107
}

0 commit comments

Comments
 (0)