Skip to content

Commit 7d04f92

Browse files
committed
[lld][ELF] Extend profile guided function ordering to ELF binaries
Extend the balanced partitioning implementation to support ELF binaries, enabling the same startup-time and compressed-size optimizations previously available for MachO. This allows ELF binaries to benefit from profile-guided function ordering and compression-based section ordering. Add the lld flags `--irpgo-profile-sort=<profile>` and `--compression-sort={function,data,both}`. Thanks to ellishg, thevinster, and their team for their work.
1 parent 1d46020 commit 7d04f92

15 files changed

+1186
-398
lines changed

lld/Common/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ add_lld_library(lldCommon
3131
Filesystem.cpp
3232
Memory.cpp
3333
Reproduce.cpp
34+
SectionOrderer.cpp
3435
Strings.cpp
3536
TargetOptionsCommandFlags.cpp
3637
Timer.cpp

lld/Common/SectionOrderer.cpp

Lines changed: 383 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,383 @@
1+
//===- SectionOrderer.cpp---------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "lld/Common/SectionOrderer.h"
10+
#include "lld/Common/ErrorHandler.h"
11+
#include "llvm/ADT/DenseMap.h"
12+
#include "llvm/ADT/DenseSet.h"
13+
#include "llvm/ADT/SetVector.h"
14+
#include "llvm/ADT/SmallSet.h"
15+
#include "llvm/ADT/SmallVector.h"
16+
#include "llvm/ADT/StringMap.h"
17+
#include "llvm/ADT/StringRef.h"
18+
#include "llvm/ProfileData/InstrProfReader.h"
19+
#include "llvm/Support/BalancedPartitioning.h"
20+
#include "llvm/Support/TimeProfiler.h"
21+
#include "llvm/Support/VirtualFileSystem.h"
22+
#include "llvm/Support/xxhash.h"
23+
24+
#define DEBUG_TYPE "bp-section-orderer"
25+
using namespace llvm;
26+
using UtilityNodes = SmallVector<BPFunctionNode::UtilityNodeT>;
27+
28+
namespace lld {
29+
30+
/// Build a set of utility nodes for each section in \p sectionIdxs based on
/// the hashes of its contents, for use as compression-affinity signals in
/// balanced partitioning.
///
/// Each section's content hashes are collected via getSectionHash(); hashes
/// that are neither too rare (frequency <= 1) nor too common (appearing in
/// more than half of the sections) are assigned fresh utility nodes starting
/// after \p maxUN, which is updated in place so callers can keep allocating
/// non-conflicting utility nodes.
///
/// If \p duplicateSectionIdxs is non-null, sections whose filtered whole-
/// content hash collides are treated as near-duplicates: only the first such
/// section is kept for partitioning and the rest are recorded against it in
/// \p duplicateSectionIdxs.
///
/// Returns one (sectionIdx, utility nodes) pair per surviving section.
static SmallVector<std::pair<unsigned, UtilityNodes>> getUnsForCompression(
    ArrayRef<const BPSectionBase *> sections,
    const DenseMap<const BPSectionBase *, uint64_t> &sectionToIdx,
    ArrayRef<unsigned> sectionIdxs,
    DenseMap<unsigned, SmallVector<unsigned>> *duplicateSectionIdxs,
    BPFunctionNode::UtilityNodeT &maxUN) {
  TimeTraceScope timeScope("Build nodes for compression");

  SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
  sectionHashes.reserve(sectionIdxs.size());
  SmallVector<uint64_t> hashes;

  for (unsigned sectionIdx : sectionIdxs) {
    const auto *isec = sections[sectionIdx];
    isec->getSectionHash(hashes, sectionToIdx);
    sectionHashes.emplace_back(sectionIdx, std::move(hashes));
    hashes.clear();
  }

  // Count how many sections each hash value appears in.
  DenseMap<uint64_t, unsigned> hashFrequency;
  for (auto &[sectionIdx, hashes] : sectionHashes)
    for (auto hash : hashes)
      ++hashFrequency[hash];

  if (duplicateSectionIdxs) {
    // Merge sections that are nearly identical: sections whose common-enough
    // hashes XOR to the same whole-content hash are considered duplicates of
    // the first section seen with that hash.
    SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
    DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
    for (auto &[sectionIdx, hashes] : sectionHashes) {
      uint64_t wholeHash = 0;
      for (auto hash : hashes)
        if (hashFrequency[hash] > 5)
          wholeHash ^= hash;
      auto [it, wasInserted] =
          wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
      if (wasInserted) {
        newSectionHashes.emplace_back(sectionIdx, hashes);
      } else {
        (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx);
      }
    }
    // Move instead of copy: newSectionHashes is dead after this point and the
    // element type (vector of hash vectors) is expensive to duplicate.
    sectionHashes = std::move(newSectionHashes);

    // Recompute hash frequencies over the deduplicated section set.
    hashFrequency.clear();
    for (auto &[sectionIdx, hashes] : sectionHashes)
      for (auto hash : hashes)
        ++hashFrequency[hash];
  }

  // Filter rare and common hashes and assign each a unique utility node that
  // doesn't conflict with the trace utility nodes
  DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
  for (auto &[hash, frequency] : hashFrequency) {
    if (frequency <= 1 || frequency * 2 > sectionHashes.size())
      continue;
    hashToUN[hash] = ++maxUN;
  }

  // Translate each section's hash list into its utility-node list.
  SmallVector<std::pair<unsigned, UtilityNodes>> sectionUns;
  for (auto &[sectionIdx, hashes] : sectionHashes) {
    UtilityNodes uns;
    for (auto &hash : hashes) {
      auto it = hashToUN.find(hash);
      if (it != hashToUN.end())
        uns.push_back(it->second);
    }
    // Move: uns is rebuilt on the next iteration.
    sectionUns.emplace_back(sectionIdx, std::move(uns));
  }
  return sectionUns;
}
101+
102+
/// Compute a priority for each input section using balanced partitioning,
/// optionally guided by a temporal IRPGO profile at \p profilePath.
///
/// Sections seen in profile traces are ordered for startup (earliest-
/// timestamp first); remaining code/data sections are ordered to improve
/// compressed size when \p forFunctionCompression / \p forDataCompression
/// are set. Priorities are assigned by decrementing
/// \p highestAvailablePriority (passed by reference and updated in place),
/// so earlier-ordered sections receive higher priority values.
///
/// Returns a map from each ordered section to its assigned priority;
/// sections that were skipped (no valid data, or not selected for any
/// ordering) are absent from the map.
llvm::DenseMap<const BPSectionBase *, size_t>
SectionOrderer::reorderSectionsByBalancedPartitioning(
    size_t &highestAvailablePriority, llvm::StringRef profilePath,
    bool forFunctionCompression, bool forDataCompression,
    bool compressionSortStartupFunctions, bool verbose,
    SmallVector<BPSectionBase *> inputSections) {
  TimeTraceScope timeScope("Balanced Partitioning");
  SmallVector<const BPSectionBase *> sections;
  DenseMap<const BPSectionBase *, uint64_t> sectionToIdx;
  StringMap<DenseSet<unsigned>> symbolToSectionIdxs;

  // Process input sections: index every section with valid data and record
  // which sections each defined symbol appears in.
  for (const auto *isec : inputSections) {
    if (!isec->hasValidData())
      continue;

    unsigned sectionIdx = sections.size();
    sectionToIdx.try_emplace(isec, sectionIdx);
    sections.push_back(isec);

    for (auto *sym : isec->getSymbols()) {
      if (auto *d = sym->asDefinedSymbol())
        symbolToSectionIdxs[d->getName()].insert(sectionIdx);
    }
  }
  // Re-key the symbol map by "root" symbol names so that profile names and
  // mangled/suffixed linker names can be matched up.
  StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
  for (auto &entry : symbolToSectionIdxs) {
    StringRef name = entry.getKey();
    auto &sectionIdxs = entry.getValue();
    name = BPSectionBase::getRootSymbol(name);
    rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
                                         sectionIdxs.end());
    // Linkage names can be prefixed with "_" or "l_" on Mach-O. See
    // Mangler::getNameWithPrefix() for details.
    if (name.consume_front("_") || name.consume_front("l_"))
      rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
                                          sectionIdxs.end());
  }

  BPFunctionNode::UtilityNodeT maxUN = 0;
  DenseMap<unsigned, UtilityNodes> startupSectionIdxUNs;
  // Used to define the initial order for startup functions.
  DenseMap<unsigned, size_t> sectionIdxToTimestamp;
  std::unique_ptr<InstrProfReader> reader;
  if (!profilePath.empty()) {
    auto fs = vfs::getRealFileSystem();
    auto readerOrErr = InstrProfReader::create(profilePath, *fs);
    lld::checkError(readerOrErr.takeError());

    reader = std::move(readerOrErr.get());
    // Iterating the reader populates its symbol table and temporal profile
    // traces; the entries themselves are not needed here.
    for (auto &entry : *reader) {
      // Read all entries
      (void)entry;
    }
    auto &traces = reader->getTemporalProfTraces();

    // Assign startup utility nodes per trace. Within a trace, a new utility
    // node is started whenever the accumulated timestamp or size doubles, so
    // earlier functions share more utility nodes and cluster together.
    DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
    for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
      uint64_t currentSize = 0, cutoffSize = 1;
      size_t cutoffTimestamp = 1;
      auto &trace = traces[traceIdx].FunctionNameRefs;
      for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
        auto [Filename, ParsedFuncName] = getParsedIRPGOName(
            reader->getSymtab().getFuncOrVarName(trace[timestamp]));
        ParsedFuncName = BPSectionBase::getRootSymbol(ParsedFuncName);

        auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
        if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
          continue;
        auto &sectionIdxs = sectionIdxsIt->getValue();
        // If the same symbol is found in multiple sections, they might be
        // identical, so we arbitrarily use the size from the first section.
        currentSize += sections[*sectionIdxs.begin()]->getSize();

        // Since BalancedPartitioning is sensitive to the initial order, we need
        // to explicitly define it to be ordered by earliest timestamp.
        for (unsigned sectionIdx : sectionIdxs) {
          auto [it, wasInserted] =
              sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
          if (!wasInserted)
            it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
        }

        if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
          ++maxUN;
          cutoffSize = 2 * currentSize;
          cutoffTimestamp = 2 * cutoffTimestamp;
        }
        for (unsigned sectionIdx : sectionIdxs)
          sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
      }
      // Each section in this trace gets every utility node from its first
      // appearance through the end of the trace.
      for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
        for (auto un = firstUN; un <= maxUN; ++un)
          startupSectionIdxUNs[sectionIdx].push_back(un);
      ++maxUN;
      sectionIdxToFirstUN.clear();
    }
  }

  // Partition the remaining (non-startup) sections into code and data
  // candidates for compression-based ordering.
  SmallVector<unsigned> sectionIdxsForFunctionCompression,
      sectionIdxsForDataCompression;
  for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
    if (startupSectionIdxUNs.count(sectionIdx))
      continue;
    const auto *isec = sections[sectionIdx];
    if (isec->isCodeSection()) {
      if (forFunctionCompression)
        sectionIdxsForFunctionCompression.push_back(sectionIdx);
    } else {
      if (forDataCompression)
        sectionIdxsForDataCompression.push_back(sectionIdx);
    }
  }

  if (compressionSortStartupFunctions) {
    // Also mix compression-affinity utility nodes into the startup sections'
    // node sets (deduplicated after merging).
    SmallVector<unsigned> startupIdxs;
    for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
      startupIdxs.push_back(sectionIdx);
    auto unsForStartupFunctionCompression =
        getUnsForCompression(sections, sectionToIdx, startupIdxs,
                             /*duplicateSectionIdxs=*/nullptr, maxUN);
    for (auto &[sectionIdx, compressionUns] :
         unsForStartupFunctionCompression) {
      auto &uns = startupSectionIdxUNs[sectionIdx];
      uns.append(compressionUns);
      llvm::sort(uns);
      uns.erase(std::unique(uns.begin(), uns.end()), uns.end());
    }
  }

  // Map a section index (order directly) to a list of duplicate section indices
  // (not ordered directly).
  DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs;
  auto unsForFunctionCompression = getUnsForCompression(
      sections, sectionToIdx, sectionIdxsForFunctionCompression,
      &duplicateSectionIdxs, maxUN);
  auto unsForDataCompression = getUnsForCompression(
      sections, sectionToIdx, sectionIdxsForDataCompression,
      &duplicateSectionIdxs, maxUN);

  // Build the three independent node sets that will be partitioned.
  std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
      nodesForDataCompression;
  for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
    nodesForStartup.emplace_back(sectionIdx, uns);
  for (auto &[sectionIdx, uns] : unsForFunctionCompression)
    nodesForFunctionCompression.emplace_back(sectionIdx, uns);
  for (auto &[sectionIdx, uns] : unsForDataCompression)
    nodesForDataCompression.emplace_back(sectionIdx, uns);

  // Use the first timestamp to define the initial order for startup nodes.
  llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
    return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
           std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
  });
  // Sort compression nodes by their Id (which is the section index) because the
  // input linker order tends to be not bad.
  llvm::sort(nodesForFunctionCompression,
             [](auto &L, auto &R) { return L.Id < R.Id; });
  llvm::sort(nodesForDataCompression,
             [](auto &L, auto &R) { return L.Id < R.Id; });

  {
    TimeTraceScope timeScope("Balanced Partitioning");
    BalancedPartitioningConfig config;
    BalancedPartitioning bp(config);
    bp.run(nodesForStartup);
    bp.run(nodesForFunctionCompression);
    bp.run(nodesForDataCompression);
  }

  unsigned numStartupSections = 0;
  unsigned numCodeCompressionSections = 0;
  unsigned numDuplicateCodeSections = 0;
  unsigned numDataCompressionSections = 0;
  unsigned numDuplicateDataSections = 0;
  // SetVector preserves insertion order and rejects re-insertion, so a
  // section ordered by an earlier phase keeps its earlier position.
  SetVector<const BPSectionBase *> orderedSections;
  // Order startup functions,
  for (auto &node : nodesForStartup) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numStartupSections;
  }
  // then functions for compression,
  for (auto &node : nodesForFunctionCompression) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numCodeCompressionSections;

    // Place each near-duplicate section immediately after its representative.
    auto It = duplicateSectionIdxs.find(node.Id);
    if (It == duplicateSectionIdxs.end())
      continue;
    for (auto dupSecIdx : It->getSecond()) {
      const auto *dupIsec = sections[dupSecIdx];
      if (orderedSections.insert(dupIsec))
        ++numDuplicateCodeSections;
    }
  }
  // then data for compression.
  for (auto &node : nodesForDataCompression) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numDataCompressionSections;
    auto It = duplicateSectionIdxs.find(node.Id);
    if (It == duplicateSectionIdxs.end())
      continue;
    for (auto dupSecIdx : It->getSecond()) {
      const auto *dupIsec = sections[dupSecIdx];
      if (orderedSections.insert(dupIsec))
        ++numDuplicateDataSections;
    }
  }

  if (verbose) {
    unsigned numTotalOrderedSections =
        numStartupSections + numCodeCompressionSections +
        numDuplicateCodeSections + numDataCompressionSections +
        numDuplicateDataSections;
    dbgs()
        << "Ordered " << numTotalOrderedSections
        << " sections using balanced partitioning:\n  Functions for startup: "
        << numStartupSections
        << "\n  Functions for compression: " << numCodeCompressionSections
        << "\n  Duplicate functions: " << numDuplicateCodeSections
        << "\n  Data for compression: " << numDataCompressionSections
        << "\n  Duplicate data: " << numDuplicateDataSections << "\n";

    if (!profilePath.empty()) {
      // Evaluate this function order for startup: simulate page faults over
      // the profile traces given the chosen layout.
      StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
      const uint64_t pageSize = (1 << 14);
      uint64_t currentAddress = 0;
      for (const auto *isec : orderedSections) {
        for (auto *sym : isec->getSymbols()) {
          if (auto *d = sym->asDefinedSymbol()) {
            uint64_t startAddress = currentAddress + d->getValue();
            uint64_t endAddress = startAddress + d->getSize();
            uint64_t firstPage = startAddress / pageSize;
            // I think the kernel might pull in a few pages when one is
            // touched, so it might be more accurate to force lastPage to be
            // aligned by 4?
            uint64_t lastPage = endAddress / pageSize;
            StringRef rootSymbol = d->getName();
            rootSymbol = BPSectionBase::getRootSymbol(rootSymbol);
            symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
            if (rootSymbol.consume_front("_") || rootSymbol.consume_front("l_"))
              symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
          }
        }
        currentAddress += isec->getSize();
      }

      // The area under the curve F where F(t) is the total number of page
      // faults at step t.
      unsigned area = 0;
      for (auto &trace : reader->getTemporalProfTraces()) {
        SmallSet<uint64_t, 0> touchedPages;
        for (unsigned step = 0; step < trace.FunctionNameRefs.size(); step++) {
          auto traceId = trace.FunctionNameRefs[step];
          auto [Filename, ParsedFuncName] =
              getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
          ParsedFuncName = BPSectionBase::getRootSymbol(ParsedFuncName);
          auto it = symbolToPageNumbers.find(ParsedFuncName);
          if (it != symbolToPageNumbers.end()) {
            auto &[firstPage, lastPage] = it->getValue();
            for (uint64_t i = firstPage; i <= lastPage; i++)
              touchedPages.insert(i);
          }
          area += touchedPages.size();
        }
      }
      dbgs() << "Total area under the page fault curve: " << (float)area
             << "\n";
    }
  }

  // Assign decreasing priorities in final order; the caller's counter is
  // updated so later phases keep allocating below these priorities.
  DenseMap<const BPSectionBase *, size_t> sectionPriorities;
  for (const auto *isec : orderedSections)
    sectionPriorities[isec] = --highestAvailablePriority;
  return sectionPriorities;
}
382+
383+
} // namespace lld

0 commit comments

Comments
 (0)