
Commit 7355aae

[lld][ELF] Extend profile-guided function ordering to ELF binaries
Extend the balanced partitioning implementation to support ELF binaries, enabling the same startup-time and compressed-size optimizations previously available for MachO. This allows ELF binaries to benefit from profile-guided function ordering and compression-based section ordering. Add the lld flags `--irpgo-profile-sort=<profile>` and `--compression-sort={function,data,both}`. Thanks to @ellishg, @thevinster, and their team for their prior work.
1 parent 53e9eee commit 7355aae
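For reference, a minimal invocation exercising the new flags might look like this (object and profile file names are hypothetical; only the flags come from this commit):

    ld.lld main.o util.o -o app \
      --irpgo-profile-sort=profile.profdata \
      --compression-sort=both

`--irpgo-profile-sort` orders functions for startup time using a temporal IRPGO profile; `--compression-sort` accepts `function`, `data`, or `both` to order the remaining sections for compressed size.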

15 files changed: +1273 -398 lines

lld/Common/BPSectionOrdererBase.cpp

Lines changed: 379 additions & 0 deletions
@@ -0,0 +1,379 @@
//===- BPSectionOrdererBase.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "lld/Common/BPSectionOrdererBase.h"
#include "lld/Common/ErrorHandler.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/xxhash.h"

#define DEBUG_TYPE "bp-section-orderer"
using namespace llvm;
using UtilityNodes = SmallVector<BPFunctionNode::UtilityNodeT>;

namespace lld {

static SmallVector<std::pair<unsigned, UtilityNodes>> getUnsForCompression(
    ArrayRef<const BPSectionBase *> sections,
    const DenseMap<const BPSectionBase *, uint64_t> &sectionToIdx,
    ArrayRef<unsigned> sectionIdxs,
    DenseMap<unsigned, SmallVector<unsigned>> *duplicateSectionIdxs,
    BPFunctionNode::UtilityNodeT &maxUN) {
  TimeTraceScope timeScope("Build nodes for compression");

  SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> sectionHashes;
  sectionHashes.reserve(sectionIdxs.size());
  SmallVector<uint64_t> hashes;

  for (unsigned sectionIdx : sectionIdxs) {
    const auto *isec = sections[sectionIdx];
    isec->getSectionHash(hashes);
    sectionHashes.emplace_back(sectionIdx, std::move(hashes));
    hashes.clear();
  }

  DenseMap<uint64_t, unsigned> hashFrequency;
  for (auto &[sectionIdx, hashes] : sectionHashes)
    for (auto hash : hashes)
      ++hashFrequency[hash];

  if (duplicateSectionIdxs) {
    // Merge sections that are nearly identical
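    // (Annotation, not in the committed file: two sections are treated as
    // duplicates when the XOR of their frequent hashes matches, where
    // "frequent" means the hash is seen more than 5 times across the input
    // sections; rarer hashes are ignored so that small local differences do
    // not prevent merging.)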
    SmallVector<std::pair<unsigned, SmallVector<uint64_t>>> newSectionHashes;
    DenseMap<uint64_t, unsigned> wholeHashToSectionIdx;
    for (auto &[sectionIdx, hashes] : sectionHashes) {
      uint64_t wholeHash = 0;
      for (auto hash : hashes)
        if (hashFrequency[hash] > 5)
          wholeHash ^= hash;
      auto [it, wasInserted] =
          wholeHashToSectionIdx.insert(std::make_pair(wholeHash, sectionIdx));
      if (wasInserted) {
        newSectionHashes.emplace_back(sectionIdx, hashes);
      } else {
        (*duplicateSectionIdxs)[it->getSecond()].push_back(sectionIdx);
      }
    }
    sectionHashes = newSectionHashes;

    // Recompute hash frequencies
    hashFrequency.clear();
    for (auto &[sectionIdx, hashes] : sectionHashes)
      for (auto hash : hashes)
        ++hashFrequency[hash];
  }

  // Filter rare and common hashes and assign each a unique utility node that
  // doesn't conflict with the trace utility nodes
  DenseMap<uint64_t, BPFunctionNode::UtilityNodeT> hashToUN;
  for (auto &[hash, frequency] : hashFrequency) {
    if (frequency <= 1 || frequency * 2 > sectionHashes.size())
      continue;
    hashToUN[hash] = ++maxUN;
  }

  SmallVector<std::pair<unsigned, UtilityNodes>> sectionUns;
  for (auto &[sectionIdx, hashes] : sectionHashes) {
    UtilityNodes uns;
    for (auto &hash : hashes) {
      auto it = hashToUN.find(hash);
      if (it != hashToUN.end())
        uns.push_back(it->second);
    }
    sectionUns.emplace_back(sectionIdx, uns);
  }
  return sectionUns;
}

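// Annotation (not in the committed file): the method below proceeds in four
// steps:
//   1. Collect valid input sections and group them by root symbol name.
//   2. If an IRPGO profile is given, walk its temporal traces and assign
//      "startup" utility nodes so functions executed close together share
//      nodes.
//   3. Assign "compression" utility nodes from content hashes (via
//      getUnsForCompression) to the remaining code and data sections.
//   4. Run BalancedPartitioning over each node set and hand out descending
//      priorities in the resulting order.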
llvm::DenseMap<const BPSectionBase *, size_t>
BPSectionOrdererBase::reorderSectionsByBalancedPartitioning(
    size_t &highestAvailablePriority, llvm::StringRef profilePath,
    bool forFunctionCompression, bool forDataCompression,
    bool compressionSortStartupFunctions, bool verbose,
    SmallVector<std::unique_ptr<BPSectionBase>> &inputSections) {
  TimeTraceScope timeScope("Setup Balanced Partitioning");
  SmallVector<const BPSectionBase *> sections;
  DenseMap<const BPSectionBase *, uint64_t> sectionToIdx;
  StringMap<DenseSet<unsigned>> symbolToSectionIdxs;

  // Process input sections
  for (const auto &isec : inputSections) {
    if (!isec->hasValidData())
      continue;

    unsigned sectionIdx = sections.size();
    sectionToIdx.try_emplace(isec.get(), sectionIdx);
    sections.emplace_back(isec.get());
    for (auto &sym : isec->getSymbols())
      if (auto *d = sym->asDefinedSymbol())
        symbolToSectionIdxs[d->getName()].insert(sectionIdx);
  }
  StringMap<DenseSet<unsigned>> rootSymbolToSectionIdxs;
  for (auto &entry : symbolToSectionIdxs) {
    StringRef name = entry.getKey();
    auto &sectionIdxs = entry.getValue();
    name = BPSectionBase::getRootSymbol(name);
    rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
                                         sectionIdxs.end());
    if (sections[*sectionIdxs.begin()]->needResolveLinkageName(name))
      rootSymbolToSectionIdxs[name].insert(sectionIdxs.begin(),
                                           sectionIdxs.end());
  }

  BPFunctionNode::UtilityNodeT maxUN = 0;
  DenseMap<unsigned, UtilityNodes> startupSectionIdxUNs;
  // Used to define the initial order for startup functions.
  DenseMap<unsigned, size_t> sectionIdxToTimestamp;
  std::unique_ptr<InstrProfReader> reader;
  if (!profilePath.empty()) {
    auto fs = vfs::getRealFileSystem();
    auto readerOrErr = InstrProfReader::create(profilePath, *fs);
    lld::checkError(readerOrErr.takeError());

    reader = std::move(readerOrErr.get());
    for (auto &entry : *reader) {
      // Read all entries
      (void)entry;
    }
    auto &traces = reader->getTemporalProfTraces();

    DenseMap<unsigned, BPFunctionNode::UtilityNodeT> sectionIdxToFirstUN;
    for (size_t traceIdx = 0; traceIdx < traces.size(); traceIdx++) {
      uint64_t currentSize = 0, cutoffSize = 1;
      size_t cutoffTimestamp = 1;
      auto &trace = traces[traceIdx].FunctionNameRefs;
      for (size_t timestamp = 0; timestamp < trace.size(); timestamp++) {
        auto [Filename, ParsedFuncName] = getParsedIRPGOName(
            reader->getSymtab().getFuncOrVarName(trace[timestamp]));
        ParsedFuncName = BPSectionBase::getRootSymbol(ParsedFuncName);

        auto sectionIdxsIt = rootSymbolToSectionIdxs.find(ParsedFuncName);
        if (sectionIdxsIt == rootSymbolToSectionIdxs.end())
          continue;
        auto &sectionIdxs = sectionIdxsIt->getValue();
        // If the same symbol is found in multiple sections, they might be
        // identical, so we arbitrarily use the size from the first section.
        currentSize += sections[*sectionIdxs.begin()]->getSize();

        // Since BalancedPartitioning is sensitive to the initial order, we
        // need to explicitly define it to be ordered by earliest timestamp.
        for (unsigned sectionIdx : sectionIdxs) {
          auto [it, wasInserted] =
              sectionIdxToTimestamp.try_emplace(sectionIdx, timestamp);
          if (!wasInserted)
            it->getSecond() = std::min<size_t>(it->getSecond(), timestamp);
        }

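        // (Annotation, not in the committed file: cutoffSize and
        // cutoffTimestamp double each time the check below fires, so new
        // utility nodes start roughly at sizes/timestamps 1, 2, 4, 8, ...
        // A function first seen in window w receives nodes w..maxUN, so
        // functions that appear early in a trace share nodes with most of
        // the trace and end up clustered toward the front.)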
        if (timestamp >= cutoffTimestamp || currentSize >= cutoffSize) {
          ++maxUN;
          cutoffSize = 2 * currentSize;
          cutoffTimestamp = 2 * cutoffTimestamp;
        }
        for (unsigned sectionIdx : sectionIdxs)
          sectionIdxToFirstUN.try_emplace(sectionIdx, maxUN);
      }
      for (auto &[sectionIdx, firstUN] : sectionIdxToFirstUN)
        for (auto un = firstUN; un <= maxUN; ++un)
          startupSectionIdxUNs[sectionIdx].push_back(un);
      ++maxUN;
      sectionIdxToFirstUN.clear();
    }
  }

  SmallVector<unsigned> sectionIdxsForFunctionCompression,
      sectionIdxsForDataCompression;
  for (unsigned sectionIdx = 0; sectionIdx < sections.size(); sectionIdx++) {
    if (startupSectionIdxUNs.count(sectionIdx))
      continue;
    const auto *isec = sections[sectionIdx];
    if (isec->isCodeSection()) {
      if (forFunctionCompression)
        sectionIdxsForFunctionCompression.push_back(sectionIdx);
    } else {
      if (forDataCompression)
        sectionIdxsForDataCompression.push_back(sectionIdx);
    }
  }

  if (compressionSortStartupFunctions) {
    SmallVector<unsigned> startupIdxs;
    for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
      startupIdxs.push_back(sectionIdx);
    auto unsForStartupFunctionCompression =
        getUnsForCompression(sections, sectionToIdx, startupIdxs,
                             /*duplicateSectionIdxs=*/nullptr, maxUN);
    for (auto &[sectionIdx, compressionUns] :
         unsForStartupFunctionCompression) {
      auto &uns = startupSectionIdxUNs[sectionIdx];
      uns.append(compressionUns);
      llvm::sort(uns);
      uns.erase(std::unique(uns.begin(), uns.end()), uns.end());
    }
  }

  // Map a section index (ordered directly) to a list of duplicate section
  // indices (not ordered directly).
  DenseMap<unsigned, SmallVector<unsigned>> duplicateSectionIdxs;
  auto unsForFunctionCompression = getUnsForCompression(
      sections, sectionToIdx, sectionIdxsForFunctionCompression,
      &duplicateSectionIdxs, maxUN);
  auto unsForDataCompression = getUnsForCompression(
      sections, sectionToIdx, sectionIdxsForDataCompression,
      &duplicateSectionIdxs, maxUN);

  std::vector<BPFunctionNode> nodesForStartup, nodesForFunctionCompression,
      nodesForDataCompression;
  for (auto &[sectionIdx, uns] : startupSectionIdxUNs)
    nodesForStartup.emplace_back(sectionIdx, uns);
  for (auto &[sectionIdx, uns] : unsForFunctionCompression)
    nodesForFunctionCompression.emplace_back(sectionIdx, uns);
  for (auto &[sectionIdx, uns] : unsForDataCompression)
    nodesForDataCompression.emplace_back(sectionIdx, uns);

  // Use the first timestamp to define the initial order for startup nodes.
  llvm::sort(nodesForStartup, [&sectionIdxToTimestamp](auto &L, auto &R) {
    return std::make_pair(sectionIdxToTimestamp[L.Id], L.Id) <
           std::make_pair(sectionIdxToTimestamp[R.Id], R.Id);
  });
  // Sort compression nodes by their Id (which is the section index) because
  // the input linker order is already a reasonable baseline.
  llvm::sort(nodesForFunctionCompression,
             [](auto &L, auto &R) { return L.Id < R.Id; });
  llvm::sort(nodesForDataCompression,
             [](auto &L, auto &R) { return L.Id < R.Id; });

  {
    TimeTraceScope timeScope("Balanced Partitioning");
    BalancedPartitioningConfig config;
    BalancedPartitioning bp(config);
    bp.run(nodesForStartup);
    bp.run(nodesForFunctionCompression);
    bp.run(nodesForDataCompression);
  }

  unsigned numStartupSections = 0;
  unsigned numCodeCompressionSections = 0;
  unsigned numDuplicateCodeSections = 0;
  unsigned numDataCompressionSections = 0;
  unsigned numDuplicateDataSections = 0;
  SetVector<const BPSectionBase *> orderedSections;
  // Order startup functions,
  for (auto &node : nodesForStartup) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numStartupSections;
  }
  // then functions for compression,
  for (auto &node : nodesForFunctionCompression) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numCodeCompressionSections;

    auto It = duplicateSectionIdxs.find(node.Id);
    if (It == duplicateSectionIdxs.end())
      continue;
    for (auto dupSecIdx : It->getSecond()) {
      const auto *dupIsec = sections[dupSecIdx];
      if (orderedSections.insert(dupIsec))
        ++numDuplicateCodeSections;
    }
  }
  // then data for compression.
  for (auto &node : nodesForDataCompression) {
    const auto *isec = sections[node.Id];
    if (orderedSections.insert(isec))
      ++numDataCompressionSections;
    auto It = duplicateSectionIdxs.find(node.Id);
    if (It == duplicateSectionIdxs.end())
      continue;
    for (auto dupSecIdx : It->getSecond()) {
      const auto *dupIsec = sections[dupSecIdx];
      if (orderedSections.insert(dupIsec))
        ++numDuplicateDataSections;
    }
  }

  if (verbose) {
    unsigned numTotalOrderedSections =
        numStartupSections + numCodeCompressionSections +
        numDuplicateCodeSections + numDataCompressionSections +
        numDuplicateDataSections;
    dbgs()
        << "Ordered " << numTotalOrderedSections
        << " sections using balanced partitioning:\n Functions for startup: "
        << numStartupSections
        << "\n Functions for compression: " << numCodeCompressionSections
        << "\n Duplicate functions: " << numDuplicateCodeSections
        << "\n Data for compression: " << numDataCompressionSections
        << "\n Duplicate data: " << numDuplicateDataSections << "\n";

    if (!profilePath.empty()) {
      // Evaluate this function order for startup
      StringMap<std::pair<uint64_t, uint64_t>> symbolToPageNumbers;
      const uint64_t pageSize = (1 << 14);
      uint64_t currentAddress = 0;
      for (const auto *isec : orderedSections) {
        for (auto &sym : isec->getSymbols()) {
          if (auto *d = sym->asDefinedSymbol()) {
            uint64_t startAddress = currentAddress + d->getValue().value_or(0);
            uint64_t endAddress = startAddress + d->getSize().value_or(0);
            uint64_t firstPage = startAddress / pageSize;
            // I think the kernel might pull in a few pages when one is
            // touched, so it might be more accurate to force lastPage to be
            // aligned by 4?
            uint64_t lastPage = endAddress / pageSize;
            StringRef rootSymbol = d->getName();
            rootSymbol = BPSectionBase::getRootSymbol(rootSymbol);
            symbolToPageNumbers.try_emplace(rootSymbol, firstPage, lastPage);
            if (isec->needResolveLinkageName(rootSymbol))
              symbolToPageNumbers.try_emplace(rootSymbol, firstPage,
                                              lastPage);
          }
        }
        currentAddress += isec->getSize();
      }

      // The area under the curve F where F(t) is the total number of page
      // faults at step t.
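      // (Annotation, not in the committed file: e.g. if a trace touches
      // page set {1} at step 0 and {1, 2} by step 1, the area is
      // 1 + 2 = 3. Orderings that keep hot functions on fewer pages give a
      // smaller area, so lower is better.)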
      unsigned area = 0;
      for (auto &trace : reader->getTemporalProfTraces()) {
        SmallSet<uint64_t, 0> touchedPages;
        for (unsigned step = 0; step < trace.FunctionNameRefs.size();
             step++) {
          auto traceId = trace.FunctionNameRefs[step];
          auto [Filename, ParsedFuncName] =
              getParsedIRPGOName(reader->getSymtab().getFuncOrVarName(traceId));
          ParsedFuncName = BPSectionBase::getRootSymbol(ParsedFuncName);
          auto it = symbolToPageNumbers.find(ParsedFuncName);
          if (it != symbolToPageNumbers.end()) {
            auto &[firstPage, lastPage] = it->getValue();
            for (uint64_t i = firstPage; i <= lastPage; i++)
              touchedPages.insert(i);
          }
          area += touchedPages.size();
        }
      }
      dbgs() << "Total area under the page fault curve: " << (float)area
             << "\n";
    }
  }

  DenseMap<const BPSectionBase *, size_t> sectionPriorities;
  for (const auto *isec : orderedSections)
    sectionPriorities[isec] = --highestAvailablePriority;
  return sectionPriorities;
}

} // namespace lld
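The code above consumes `BPSectionBase` through a handful of virtual calls but does not show their declarations; they live in `lld/Common/BPSectionOrdererBase.h`, which is among the 13 changed files not shown here. As rough orientation, a minimal ELF-side adapter could look like the sketch below. Every name, signature, and `override` is inferred from the call sites above and is an assumption, not the committed API.

// Hypothetical sketch only; reconstructed from the call sites in
// BPSectionOrdererBase.cpp above, not copied from the commit.
#include "lld/Common/BPSectionOrdererBase.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Support/xxhash.h"

namespace lld::elf {

class BPSectionELFSketch : public BPSectionBase {
  const InputSectionBase *isec; // wrapped lld/ELF section (assumed type)

public:
  explicit BPSectionELFSketch(const InputSectionBase *isec) : isec(isec) {}

  // Sections with no contents cannot be hashed or ordered.
  bool hasValidData() const override {
    return isec && !isec->content().empty();
  }

  uint64_t getSize() const override { return isec->getSize(); }

  bool isCodeSection() const override {
    return isec->flags & llvm::ELF::SHF_EXECINSTR;
  }

  // Produce stable fingerprints of the contents; a windowed run of xxHash
  // values makes nearly identical sections share most hashes, which is what
  // getUnsForCompression() above relies on.
  void getSectionHash(llvm::SmallVectorImpl<uint64_t> &hashes) const override {
    constexpr size_t windowSize = 8;
    llvm::ArrayRef<uint8_t> data = isec->content();
    for (size_t i = 0; i + windowSize <= data.size(); i += windowSize)
      hashes.push_back(llvm::xxh3_64bits(data.slice(i, windowSize)));
  }

  // Mach-O strips linkage-name prefixes here (hence the second insert into
  // rootSymbolToSectionIdxs above); plain ELF symbol names need no fixup.
  bool needResolveLinkageName(llvm::StringRef &name) const override {
    return false;
  }

  // getSymbols() / asDefinedSymbol() are also required by the orderer but
  // are elided in this sketch.
};

} // namespace lld::elf

The hashing choice matters most: `getUnsForCompression` merges and clusters sections by shared hashes, so whatever the real adapter emits should be fingerprints that nearly identical sections have mostly in common.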

lld/Common/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ set_source_files_properties("${version_inc}"
 add_lld_library(lldCommon
   Args.cpp
+  BPSectionOrdererBase.cpp
   CommonLinkerContext.cpp
   DriverDispatcher.cpp
   DWARF.cpp
