Skip to content

Commit 9f8205d

Browse files
[MemProf] Track and report profiled sizes through cloning (#98382)
If requested, via the -memprof-report-hinted-sizes option, track the total profiled size of each MIB through the thin link, then report on the corresponding allocation coldness after all cloning is complete. To save size, the sizes are written as an optional trailing array on the existing allocation info bitcode records only when the option is specified, and they are kept separate from the MIBs in the index.
1 parent 1cafde2 commit 9f8205d

File tree

10 files changed

+161
-34
lines changed

10 files changed

+161
-34
lines changed

llvm/include/llvm/Bitcode/LLVMBitCodes.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -307,7 +307,8 @@ enum GlobalValueSummarySymtabCodes {
307307
// [valueid, n x stackidindex]
308308
FS_PERMODULE_CALLSITE_INFO = 26,
309309
// Summary of per-module allocation memprof metadata.
310-
// [n x (alloc type, nummib, nummib x stackidindex)]
310+
// [nummib, nummib x (alloc type, numstackids, numstackids x stackidindex),
311+
// [nummib x total size]?]
311312
FS_PERMODULE_ALLOC_INFO = 27,
312313
// Summary of combined index memprof callsite metadata.
313314
// [valueid, numstackindices, numver,
@@ -316,7 +317,7 @@ enum GlobalValueSummarySymtabCodes {
316317
// Summary of combined index allocation memprof metadata.
317318
// [nummib, numver,
318319
// nummib x (alloc type, numstackids, numstackids x stackidindex),
319-
// numver x version]
320+
// numver x version, [nummib x total size]?]
320321
FS_COMBINED_ALLOC_INFO = 29,
321322
FS_STACK_IDS = 30,
322323
};

llvm/include/llvm/IR/ModuleSummaryIndex.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,10 @@ struct AllocInfo {
403403
// Vector of MIBs in this memprof metadata.
404404
std::vector<MIBInfo> MIBs;
405405

406+
// If requested, keep track of total profiled sizes for each MIB. This will be
407+
// a vector of the same length and order as the MIBs vector, if non-empty.
408+
std::vector<uint64_t> TotalSizes;
409+
406410
AllocInfo(std::vector<MIBInfo> MIBs) : MIBs(std::move(MIBs)) {
407411
Versions.push_back(0);
408412
}
@@ -423,6 +427,16 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) {
423427
for (auto &M : AE.MIBs) {
424428
OS << "\t\t" << M << "\n";
425429
}
430+
if (!AE.TotalSizes.empty()) {
431+
OS << " TotalSizes per MIB:\n\t\t";
432+
First = true;
433+
for (uint64_t TS : AE.TotalSizes) {
434+
if (!First)
435+
OS << ", ";
436+
First = false;
437+
OS << TS << "\n";
438+
}
439+
}
426440
return OS;
427441
}
428442

@@ -1431,7 +1445,7 @@ class ModuleSummaryIndex {
14311445
// in the way some record are interpreted, like flags for instance.
14321446
// Note that incrementing this may require changes in both BitcodeReader.cpp
14331447
// and BitcodeWriter.cpp.
1434-
static constexpr uint64_t BitcodeSummaryVersion = 9;
1448+
static constexpr uint64_t BitcodeSummaryVersion = 10;
14351449

14361450
// Regular LTO module name for ASM writer
14371451
static constexpr const char *getRegularLTOModuleName() {

llvm/lib/Analysis/ModuleSummaryAnalysis.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ extern cl::opt<bool> ScalePartialSampleProfileWorkingSetSize;
8585

8686
extern cl::opt<unsigned> MaxNumVTableAnnotations;
8787

88+
extern cl::opt<bool> MemProfReportHintedSizes;
89+
8890
// Walk through the operands of a given User via worklist iteration and populate
8991
// the set of GlobalValue references encountered. Invoked either on an
9092
// Instruction or a GlobalVariable (which walks its initializer).
@@ -517,6 +519,7 @@ static void computeFunctionSummary(
517519
auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
518520
if (MemProfMD) {
519521
std::vector<MIBInfo> MIBs;
522+
std::vector<uint64_t> TotalSizes;
520523
for (auto &MDOp : MemProfMD->operands()) {
521524
auto *MIBMD = cast<const MDNode>(MDOp);
522525
MDNode *StackNode = getMIBStackNode(MIBMD);
@@ -536,8 +539,17 @@ static void computeFunctionSummary(
536539
}
537540
MIBs.push_back(
538541
MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices)));
542+
if (MemProfReportHintedSizes) {
543+
auto TotalSize = getMIBTotalSize(MIBMD);
544+
assert(TotalSize);
545+
TotalSizes.push_back(TotalSize);
546+
}
539547
}
540548
Allocs.push_back(AllocInfo(std::move(MIBs)));
549+
if (MemProfReportHintedSizes) {
550+
assert(Allocs.back().MIBs.size() == TotalSizes.size());
551+
Allocs.back().TotalSizes = std::move(TotalSizes);
552+
}
541553
} else if (!InstCallsite.empty()) {
542554
SmallVector<unsigned> StackIdIndices;
543555
for (auto StackId : InstCallsite)

llvm/lib/Bitcode/Reader/BitcodeReader.cpp

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7994,7 +7994,12 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
79947994
case bitc::FS_PERMODULE_ALLOC_INFO: {
79957995
unsigned I = 0;
79967996
std::vector<MIBInfo> MIBs;
7997-
while (I < Record.size()) {
7997+
unsigned NumMIBs = 0;
7998+
if (Version >= 10)
7999+
NumMIBs = Record[I++];
8000+
unsigned MIBsRead = 0;
8001+
while ((Version >= 10 && MIBsRead++ < NumMIBs) ||
8002+
(Version < 10 && I < Record.size())) {
79988003
assert(Record.size() - I >= 2);
79998004
AllocationType AllocType = (AllocationType)Record[I++];
80008005
unsigned NumStackEntries = Record[I++];
@@ -8007,7 +8012,19 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
80078012
}
80088013
MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList)));
80098014
}
8015+
std::vector<uint64_t> TotalSizes;
8016+
// We either have no sizes or NumMIBs of them.
8017+
assert(I == Record.size() || Record.size() - I == NumMIBs);
8018+
if (I < Record.size()) {
8019+
MIBsRead = 0;
8020+
while (MIBsRead++ < NumMIBs)
8021+
TotalSizes.push_back(Record[I++]);
8022+
}
80108023
PendingAllocs.push_back(AllocInfo(std::move(MIBs)));
8024+
if (!TotalSizes.empty()) {
8025+
assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
8026+
PendingAllocs.back().TotalSizes = std::move(TotalSizes);
8027+
}
80118028
break;
80128029
}
80138030

@@ -8034,8 +8051,21 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
80348051
SmallVector<uint8_t> Versions;
80358052
for (unsigned J = 0; J < NumVersions; J++)
80368053
Versions.push_back(Record[I++]);
8054+
std::vector<uint64_t> TotalSizes;
8055+
// We either have no sizes or NumMIBs of them.
8056+
assert(I == Record.size() || Record.size() - I == NumMIBs);
8057+
if (I < Record.size()) {
8058+
MIBsRead = 0;
8059+
while (MIBsRead++ < NumMIBs) {
8060+
TotalSizes.push_back(Record[I++]);
8061+
}
8062+
}
80378063
PendingAllocs.push_back(
80388064
AllocInfo(std::move(Versions), std::move(MIBs)));
8065+
if (!TotalSizes.empty()) {
8066+
assert(PendingAllocs.back().MIBs.size() == TotalSizes.size());
8067+
PendingAllocs.back().TotalSizes = std::move(TotalSizes);
8068+
}
80398069
break;
80408070
}
80418071
}

llvm/lib/Bitcode/Writer/BitcodeWriter.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4189,10 +4189,9 @@ static void writeFunctionHeapProfileRecords(
41894189
// Per module alloc versions should always have a single entry of
41904190
// value 0.
41914191
assert(!PerModule || (AI.Versions.size() == 1 && AI.Versions[0] == 0));
4192-
if (!PerModule) {
4193-
Record.push_back(AI.MIBs.size());
4192+
Record.push_back(AI.MIBs.size());
4193+
if (!PerModule)
41944194
Record.push_back(AI.Versions.size());
4195-
}
41964195
for (auto &MIB : AI.MIBs) {
41974196
Record.push_back((uint8_t)MIB.AllocType);
41984197
Record.push_back(MIB.StackIdIndices.size());
@@ -4203,6 +4202,11 @@ static void writeFunctionHeapProfileRecords(
42034202
for (auto V : AI.Versions)
42044203
Record.push_back(V);
42054204
}
4205+
assert(AI.TotalSizes.empty() || AI.TotalSizes.size() == AI.MIBs.size());
4206+
if (!AI.TotalSizes.empty()) {
4207+
for (auto Size : AI.TotalSizes)
4208+
Record.push_back(Size);
4209+
}
42064210
Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO
42074211
: bitc::FS_COMBINED_ALLOC_INFO,
42084212
Record, AllocAbbrev);
@@ -4432,7 +4436,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
44324436

44334437
Abbv = std::make_shared<BitCodeAbbrev>();
44344438
Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO));
4439+
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib
44354440
// nummib x (alloc type, numstackids, numstackids x stackidindex)
4441+
// optional: nummib x total size
44364442
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
44374443
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
44384444
unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
@@ -4576,6 +4582,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
45764582
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver
45774583
// nummib x (alloc type, numstackids, numstackids x stackidindex),
45784584
// numver x version
4585+
// optional: nummib x total size
45794586
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
45804587
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
45814588
unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv));
@@ -4675,7 +4682,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
46754682
writeFunctionHeapProfileRecords(
46764683
Stream, FS, CallsiteAbbrev, AllocAbbrev,
46774684
/*PerModule*/ false,
4678-
/*GetValueId*/ [&](const ValueInfo &VI) -> unsigned {
4685+
/*GetValueId*/
4686+
[&](const ValueInfo &VI) -> unsigned {
46794687
std::optional<unsigned> ValueID = GetValueId(VI);
46804688
// This can happen in shared index files for distributed ThinLTO if
46814689
// the callee function summary is not included. Record 0 which we
@@ -4685,7 +4693,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
46854693
return 0;
46864694
return *ValueID;
46874695
},
4688-
/*GetStackIndex*/ [&](unsigned I) {
4696+
/*GetStackIndex*/
4697+
[&](unsigned I) {
46894698
// Get the corresponding index into the list of StackIds actually
46904699
// being written for this combined index (which may be a subset in
46914700
// the case of distributed indexes).

llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp

Lines changed: 68 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ cl::opt<bool> SupportsHotColdNew(
134134
cl::desc("Linking with hot/cold operator new interfaces"));
135135
} // namespace llvm
136136

137+
extern cl::opt<bool> MemProfReportHintedSizes;
138+
137139
namespace {
138140
/// CRTP base for graphs built from either IR or ThinLTO summary index.
139141
///
@@ -172,6 +174,7 @@ class CallsiteContextGraph {
172174

173175
void dump() const;
174176
void print(raw_ostream &OS) const;
177+
void printTotalSizes(raw_ostream &OS) const;
175178

176179
friend raw_ostream &operator<<(raw_ostream &OS,
177180
const CallsiteContextGraph &CCG) {
@@ -439,7 +442,7 @@ class CallsiteContextGraph {
439442
void addStackNodesForMIB(ContextNode *AllocNode,
440443
CallStack<NodeT, IteratorT> &StackContext,
441444
CallStack<NodeT, IteratorT> &CallsiteContext,
442-
AllocationType AllocType);
445+
AllocationType AllocType, uint64_t TotalSize);
443446

444447
/// Matches all callsite metadata (or summary) to the nodes created for
445448
/// allocation memprof MIB metadata, synthesizing new nodes to reflect any
@@ -611,6 +614,10 @@ class CallsiteContextGraph {
611614
/// Map from each context ID to the AllocationType assigned to that context.
612615
DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
613616

617+
/// Map from each context ID to the profiled aggregate allocation size,
618+
/// optionally populated when requested (via MemProfReportHintedSizes).
619+
DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;
620+
614621
/// Identifies the context node created for a stack id when adding the MIB
615622
/// contexts to the graph. This is used to locate the context nodes when
616623
/// trying to assign the corresponding callsites with those stack ids to these
@@ -1004,18 +1011,36 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
10041011
return AllocNode;
10051012
}
10061013

1014+
static std::string getAllocTypeString(uint8_t AllocTypes) {
1015+
if (!AllocTypes)
1016+
return "None";
1017+
std::string Str;
1018+
if (AllocTypes & (uint8_t)AllocationType::NotCold)
1019+
Str += "NotCold";
1020+
if (AllocTypes & (uint8_t)AllocationType::Cold)
1021+
Str += "Cold";
1022+
return Str;
1023+
}
1024+
10071025
template <typename DerivedCCG, typename FuncTy, typename CallTy>
10081026
template <class NodeT, class IteratorT>
10091027
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
10101028
ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1011-
CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
1029+
CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1030+
uint64_t TotalSize) {
1031+
assert(!MemProfReportHintedSizes || TotalSize > 0);
10121032
// Treating the hot alloc type as NotCold before the disambiguation for "hot"
10131033
// is done.
10141034
if (AllocType == AllocationType::Hot)
10151035
AllocType = AllocationType::NotCold;
10161036

10171037
ContextIdToAllocationType[++LastContextId] = AllocType;
10181038

1039+
if (MemProfReportHintedSizes) {
1040+
assert(TotalSize);
1041+
ContextIdToTotalSize[LastContextId] = TotalSize;
1042+
}
1043+
10191044
// Update alloc type and context ids for this MIB.
10201045
AllocNode->AllocTypes |= (uint8_t)AllocType;
10211046

@@ -1060,6 +1085,10 @@ CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
10601085
assert(ContextIdToAllocationType.count(OldId));
10611086
// The new context has the same allocation type as original.
10621087
ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1088+
// For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
1089+
// up the size. Assume that if we are able to duplicate context ids that we
1090+
// will be able to disambiguate all copies.
1091+
ContextIdToTotalSize[LastContextId] = 0;
10631092
}
10641093
return NewContextIds;
10651094
}
@@ -1663,7 +1692,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
16631692
CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
16641693
addStackNodesForMIB<MDNode, MDNode::op_iterator>(
16651694
AllocNode, StackContext, CallsiteContext,
1666-
getMIBAllocType(MIBMD));
1695+
getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
16671696
}
16681697
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
16691698
// Memprof and callsite metadata on memory allocations no longer
@@ -1735,12 +1764,20 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph(
17351764
// stack ids on the allocation call during ModuleSummaryAnalysis.
17361765
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
17371766
EmptyContext;
1767+
unsigned I = 0;
1768+
assert(!MemProfReportHintedSizes ||
1769+
AN.TotalSizes.size() == AN.MIBs.size());
17381770
// Now add all of the MIBs and their stack nodes.
17391771
for (auto &MIB : AN.MIBs) {
17401772
CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
17411773
StackContext(&MIB);
1774+
uint64_t TotalSize = 0;
1775+
if (MemProfReportHintedSizes)
1776+
TotalSize = AN.TotalSizes[I];
17421777
addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1743-
AllocNode, StackContext, EmptyContext, MIB.AllocType);
1778+
AllocNode, StackContext, EmptyContext, MIB.AllocType,
1779+
TotalSize);
1780+
I++;
17441781
}
17451782
assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
17461783
// Initialize version 0 on the summary alloc node to the current alloc
@@ -2171,17 +2208,6 @@ bool IndexCallsiteContextGraph::calleeMatchesFunc(
21712208
return true;
21722209
}
21732210

2174-
static std::string getAllocTypeString(uint8_t AllocTypes) {
2175-
if (!AllocTypes)
2176-
return "None";
2177-
std::string Str;
2178-
if (AllocTypes & (uint8_t)AllocationType::NotCold)
2179-
Str += "NotCold";
2180-
if (AllocTypes & (uint8_t)AllocationType::Cold)
2181-
Str += "Cold";
2182-
return Str;
2183-
}
2184-
21852211
template <typename DerivedCCG, typename FuncTy, typename CallTy>
21862212
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
21872213
const {
@@ -2261,6 +2287,30 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
22612287
}
22622288
}
22632289

2290+
template <typename DerivedCCG, typename FuncTy, typename CallTy>
2291+
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2292+
raw_ostream &OS) const {
2293+
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2294+
for (const auto Node : nodes<GraphType>(this)) {
2295+
if (Node->isRemoved())
2296+
continue;
2297+
if (!Node->IsAllocation)
2298+
continue;
2299+
DenseSet<uint32_t> ContextIds = Node->getContextIds();
2300+
std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2301+
std::sort(SortedIds.begin(), SortedIds.end());
2302+
for (auto Id : SortedIds) {
2303+
auto SizeI = ContextIdToTotalSize.find(Id);
2304+
assert(SizeI != ContextIdToTotalSize.end());
2305+
auto TypeI = ContextIdToAllocationType.find(Id);
2306+
assert(TypeI != ContextIdToAllocationType.end());
2307+
OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
2308+
<< " with total size " << SizeI->second << " is "
2309+
<< getAllocTypeString(Node->AllocTypes) << " after cloning\n";
2310+
}
2311+
}
2312+
}
2313+
22642314
template <typename DerivedCCG, typename FuncTy, typename CallTy>
22652315
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
22662316
using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
@@ -3797,6 +3847,9 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
37973847
if (ExportToDot)
37983848
exportToDot("clonefuncassign");
37993849

3850+
if (MemProfReportHintedSizes)
3851+
printTotalSizes(errs());
3852+
38003853
return Changed;
38013854
}
38023855

llvm/test/Bitcode/summary_version.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
; RUN: opt -module-summary %s -o - | llvm-bcanalyzer -dump | FileCheck %s
33

44
; CHECK: <GLOBALVAL_SUMMARY_BLOCK
5-
; CHECK: <VERSION op0=9/>
5+
; CHECK: <VERSION op0=10/>
66

77

88

0 commit comments

Comments
 (0)