From 73358b0b8640744b1247c17ae25450ff9d52642a Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Wed, 10 Jul 2024 10:02:03 -0700 Subject: [PATCH 1/5] [MemProf] Track and report profiled sizes through cloning If requested, via the -memprof-report-hinted-sizes option, track the total profiled size of each MIB through the thin link, then report on the corresponding allocation coldness after all cloning is complete. To save size, a different bitcode record type is used for the allocation info when the option is specified, and the sizes are kept separate from the MIBs in the index. --- llvm/include/llvm/Bitcode/LLVMBitCodes.h | 10 +++ llvm/include/llvm/IR/ModuleSummaryIndex.h | 14 ++++ llvm/include/llvm/IR/ModuleSummaryIndexYAML.h | 3 +- llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 16 +++- llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp | 2 + llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 34 +++++++- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 59 +++++++++++--- .../IPO/MemProfContextDisambiguation.cpp | 81 +++++++++++++++---- llvm/test/ThinLTO/X86/memprof-basic.ll | 17 ++-- .../MemProfContextDisambiguation/basic.ll | 11 ++- 10 files changed, 205 insertions(+), 42 deletions(-) diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 5b5e08b5cbc3f..9f989aaf22779 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -319,6 +319,16 @@ enum GlobalValueSummarySymtabCodes { // numver x version] FS_COMBINED_ALLOC_INFO = 29, FS_STACK_IDS = 30, + // Summary of per-module allocation memprof metadata when we are tracking + // total sizes. + // [n x (alloc type, total size, nummib, nummib x stackidindex)] + FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES = 31, + // Summary of combined index allocation memprof metadata when we are tracking + // total sizes. + // [nummib, numver, + // nummib x (alloc type, total size, numstackids, + // numstackids x stackidindex), numver x version] + FS_COMBINED_ALLOC_INFO_TOTAL_SIZES = 32, }; enum MetadataCodes { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 31271ed388e54..950c0ed824dbb 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -403,6 +403,10 @@ struct AllocInfo { // Vector of MIBs in this memprof metadata. std::vector MIBs; + // If requested, keep track of total profiled sizes for each MIB. This will be + // a vector of the same length and order as the MIBs vector. + std::unique_ptr> TotalSizes; + AllocInfo(std::vector MIBs) : MIBs(std::move(MIBs)) { Versions.push_back(0); } @@ -423,6 +427,16 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) { for (auto &M : AE.MIBs) { OS << "\t\t" << M << "\n"; } + if (AE.TotalSizes) { + OS << " TotalSizes per MIB:\n\t\t"; + First = true; + for (auto &TS : *AE.TotalSizes) { + if (!First) + OS << ", "; + First = false; + OS << TS << "\n"; + } + } return OS; } diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h index b2747d24c5396..26bf08f9f7439 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -224,6 +224,7 @@ template <> struct CustomMappingTraits { V.emplace(RefGUID, /*IsAnalysis=*/false); Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*V.find(RefGUID))); } + std::vector EmptyAllocInfo; Elem.SummaryList.push_back(std::make_unique( GlobalValueSummary::GVFlags( static_cast(FSum.Linkage), @@ -238,7 +239,7 @@ template <> struct CustomMappingTraits { std::move(FSum.TypeTestAssumeConstVCalls), std::move(FSum.TypeCheckedLoadConstVCalls), ArrayRef{}, ArrayRef{}, - ArrayRef{})); + std::move(EmptyAllocInfo))); } } static void output(IO &io, GlobalValueSummaryMapTy &V) { diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index 94ac0484f5ec7..c3b6fd8bc20af 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -85,6 +85,8 @@ extern cl::opt ScalePartialSampleProfileWorkingSetSize; extern cl::opt MaxNumVTableAnnotations; +extern cl::opt MemProfReportHintedSizes; + // Walk through the operands of a given User via worklist iteration and populate // the set of GlobalValue references encountered. Invoked either on an // Instruction or a GlobalVariable (which walks its initializer). @@ -517,6 +519,7 @@ static void computeFunctionSummary( auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); if (MemProfMD) { std::vector MIBs; + std::vector TotalSizes; for (auto &MDOp : MemProfMD->operands()) { auto *MIBMD = cast(MDOp); MDNode *StackNode = getMIBStackNode(MIBMD); @@ -536,8 +539,18 @@ static void computeFunctionSummary( } MIBs.push_back( MIBInfo(getMIBAllocType(MIBMD), std::move(StackIdIndices))); + if (MemProfReportHintedSizes) { + auto TotalSize = getMIBTotalSize(MIBMD); + assert(TotalSize); + TotalSizes.push_back(TotalSize); + } } Allocs.push_back(AllocInfo(std::move(MIBs))); + if (MemProfReportHintedSizes) { + assert(Allocs.back().MIBs.size() == TotalSizes.size()); + Allocs.back().TotalSizes = + std::make_unique>(std::move(TotalSizes)); + } } else if (!InstCallsite.empty()) { SmallVector StackIdIndices; for (auto StackId : InstCallsite) @@ -935,6 +948,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( CantBePromoted.insert(GV->getGUID()); // Create the appropriate summary type. if (Function *F = dyn_cast(GV)) { + std::vector EmptyAllocInfo; std::unique_ptr Summary = std::make_unique( GVFlags, /*InstCount=*/0, @@ -957,7 +971,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( ArrayRef{}, ArrayRef{}, ArrayRef{}, - ArrayRef{}, ArrayRef{}); + ArrayRef{}, std::move(EmptyAllocInfo)); Index.addGlobalValueSummary(*GV, std::move(Summary)); } else { std::unique_ptr Summary = diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index b7ed9cdf63145..82aa7b7e077a1 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -328,6 +328,8 @@ GetCodeName(unsigned CodeID, unsigned BlockID, STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO) STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO) STRINGIFY_CODE(FS, STACK_IDS) + STRINGIFY_CODE(FS, PERMODULE_ALLOC_INFO_TOTAL_SIZES) + STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO_TOTAL_SIZES) } case bitc::METADATA_ATTACHMENT_ID: switch (CodeID) { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index f56b2b32ff98f..1cba859a3fade 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -7991,12 +7991,19 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { break; } - case bitc::FS_PERMODULE_ALLOC_INFO: { + case bitc::FS_PERMODULE_ALLOC_INFO: + case bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES: { + bool HasTotalSizes = BitCode == bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES; unsigned I = 0; std::vector MIBs; + std::vector TotalSizes; while (I < Record.size()) { - assert(Record.size() - I >= 2); + assert(Record.size() - I >= (HasTotalSizes ? 3 : 2)); AllocationType AllocType = (AllocationType)Record[I++]; + if (HasTotalSizes) { + TotalSizes.push_back(Record[I++]); + assert(TotalSizes.back()); + } unsigned NumStackEntries = Record[I++]; assert(Record.size() - I >= NumStackEntries); SmallVector StackIdList; @@ -8008,18 +8015,31 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList))); } PendingAllocs.push_back(AllocInfo(std::move(MIBs))); + assert(HasTotalSizes != TotalSizes.empty()); + if (HasTotalSizes) { + assert(PendingAllocs.back().MIBs.size() == TotalSizes.size()); + PendingAllocs.back().TotalSizes = + std::make_unique>(std::move(TotalSizes)); + } break; } - case bitc::FS_COMBINED_ALLOC_INFO: { + case bitc::FS_COMBINED_ALLOC_INFO: + case bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES: { + bool HasTotalSizes = BitCode == bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES; unsigned I = 0; std::vector MIBs; + std::vector TotalSizes; unsigned NumMIBs = Record[I++]; unsigned NumVersions = Record[I++]; unsigned MIBsRead = 0; while (MIBsRead++ < NumMIBs) { - assert(Record.size() - I >= 2); + assert(Record.size() - I >= (HasTotalSizes ? 3 : 2)); AllocationType AllocType = (AllocationType)Record[I++]; + if (HasTotalSizes) { + TotalSizes.push_back(Record[I++]); + assert(TotalSizes.back()); + } unsigned NumStackEntries = Record[I++]; assert(Record.size() - I >= NumStackEntries); SmallVector StackIdList; @@ -8036,6 +8056,12 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { Versions.push_back(Record[I++]); PendingAllocs.push_back( AllocInfo(std::move(Versions), std::move(MIBs))); + assert(HasTotalSizes != TotalSizes.empty()); + if (HasTotalSizes) { + assert(PendingAllocs.back().MIBs.size() == TotalSizes.size()); + PendingAllocs.back().TotalSizes = + std::make_unique>(std::move(TotalSizes)); + } break; } } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 3378931065f9b..7b8f6371b5698 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -230,7 +230,8 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase { void writePerModuleFunctionSummaryRecord( SmallVector &NameVals, GlobalValueSummary *Summary, unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev, - unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F); + unsigned CallsiteAbbrev, unsigned AllocAbbrev, + unsigned AllocTotalSizeAbbrev, const Function &F); void writeModuleLevelReferences(const GlobalVariable &V, SmallVector &NameVals, unsigned FSModRefsAbbrev, @@ -4158,7 +4159,7 @@ static void writeTypeIdCompatibleVtableSummaryRecord( static void writeFunctionHeapProfileRecords( BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev, - unsigned AllocAbbrev, bool PerModule, + unsigned AllocAbbrev, unsigned AllocTotalSizeAbbrev, bool PerModule, std::function GetValueID, std::function GetStackIndex) { SmallVector Record; @@ -4193,19 +4194,31 @@ static void writeFunctionHeapProfileRecords( Record.push_back(AI.MIBs.size()); Record.push_back(AI.Versions.size()); } + unsigned I = 0; + assert(!AI.TotalSizes || AI.TotalSizes->size() == AI.MIBs.size()); for (auto &MIB : AI.MIBs) { Record.push_back((uint8_t)MIB.AllocType); + if (AI.TotalSizes) { + Record.push_back((*AI.TotalSizes)[I]); + assert((*AI.TotalSizes)[I]); + } Record.push_back(MIB.StackIdIndices.size()); for (auto Id : MIB.StackIdIndices) Record.push_back(GetStackIndex(Id)); + I++; } if (!PerModule) { for (auto V : AI.Versions) Record.push_back(V); } - Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO - : bitc::FS_COMBINED_ALLOC_INFO, - Record, AllocAbbrev); + unsigned Code = + PerModule ? (AI.TotalSizes ? bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES + : bitc::FS_PERMODULE_ALLOC_INFO) + : (AI.TotalSizes ? bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES + : bitc::FS_COMBINED_ALLOC_INFO); + unsigned AllocAbbrevToUse = + AI.TotalSizes ? AllocTotalSizeAbbrev : AllocAbbrev; + Stream.EmitRecord(Code, Record, AllocAbbrevToUse); } } @@ -4214,7 +4227,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord( SmallVector &NameVals, GlobalValueSummary *Summary, unsigned ValueID, unsigned FSCallsRelBFAbbrev, unsigned FSCallsProfileAbbrev, unsigned CallsiteAbbrev, - unsigned AllocAbbrev, const Function &F) { + unsigned AllocAbbrev, unsigned AllocTotalSizeAbbrev, const Function &F) { NameVals.push_back(ValueID); FunctionSummary *FS = cast(Summary); @@ -4225,7 +4238,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord( }); writeFunctionHeapProfileRecords( - Stream, FS, CallsiteAbbrev, AllocAbbrev, + Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev, /*PerModule*/ true, /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); }, /*GetStackIndex*/ [&](unsigned I) { return I; }); @@ -4437,6 +4450,13 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES)); + // n x (alloc type, total size, numstackids, numstackids x stackidindex) + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned AllocTotalSizeAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + SmallVector NameVals; // Iterate over the list of functions instead of the Index to // ensure the ordering is stable. @@ -4454,9 +4474,10 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { continue; } auto *Summary = VI.getSummaryList()[0].get(); - writePerModuleFunctionSummaryRecord( - NameVals, Summary, VE.getValueID(&F), FSCallsRelBFAbbrev, - FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, F); + writePerModuleFunctionSummaryRecord(NameVals, Summary, VE.getValueID(&F), + FSCallsRelBFAbbrev, + FSCallsProfileAbbrev, CallsiteAbbrev, + AllocAbbrev, AllocTotalSizeAbbrev, F); } // Capture references from GlobalVariable initializers, which are outside @@ -4586,6 +4607,16 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { return DecSummaries->count(GVS); }; + Abbv = std::make_shared(); + Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver + // nummib x (alloc type, total size, numstackids, numstackids x stackidindex), + // numver x version + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); + unsigned AllocTotalSizeAbbrev = Stream.EmitAbbrev(std::move(Abbv)); + // The aliases are emitted as a post-pass, and will point to the value // id of the aliasee. Save them in a vector for post-processing. SmallVector Aliases; @@ -4673,9 +4704,10 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { getReferencedTypeIds(FS, ReferencedTypeIds); writeFunctionHeapProfileRecords( - Stream, FS, CallsiteAbbrev, AllocAbbrev, + Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev, /*PerModule*/ false, - /*GetValueId*/ [&](const ValueInfo &VI) -> unsigned { + /*GetValueId*/ + [&](const ValueInfo &VI) -> unsigned { std::optional ValueID = GetValueId(VI); // This can happen in shared index files for distributed ThinLTO if // the callee function summary is not included. Record 0 which we @@ -4685,7 +4717,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { return 0; return *ValueID; }, - /*GetStackIndex*/ [&](unsigned I) { + /*GetStackIndex*/ + [&](unsigned I) { // Get the corresponding index into the list of StackIds actually // being written for this combined index (which may be a subset in // the case of distributed indexes). diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index c1e5ab1a2b561..f5713d31e263f 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -134,6 +134,8 @@ cl::opt SupportsHotColdNew( cl::desc("Linking with hot/cold operator new interfaces")); } // namespace llvm +extern cl::opt MemProfReportHintedSizes; + namespace { /// CRTP base for graphs built from either IR or ThinLTO summary index. /// @@ -172,6 +174,7 @@ class CallsiteContextGraph { void dump() const; void print(raw_ostream &OS) const; + void printTotalSizes(raw_ostream &OS) const; friend raw_ostream &operator<<(raw_ostream &OS, const CallsiteContextGraph &CCG) { @@ -439,7 +442,7 @@ class CallsiteContextGraph { void addStackNodesForMIB(ContextNode *AllocNode, CallStack &StackContext, CallStack &CallsiteContext, - AllocationType AllocType); + AllocationType AllocType, uint64_t TotalSize); /// Matches all callsite metadata (or summary) to the nodes created for /// allocation memprof MIB metadata, synthesizing new nodes to reflect any @@ -611,6 +614,8 @@ class CallsiteContextGraph { /// Map from each context ID to the AllocationType assigned to that context. DenseMap ContextIdToAllocationType; + std::map ContextIdToTotalSize; + /// Identifies the context node created for a stack id when adding the MIB /// contexts to the graph. This is used to locate the context nodes when /// trying to assign the corresponding callsites with those stack ids to these @@ -1004,11 +1009,24 @@ CallsiteContextGraph::addAllocNode( return AllocNode; } +static std::string getAllocTypeString(uint8_t AllocTypes) { + if (!AllocTypes) + return "None"; + std::string Str; + if (AllocTypes & (uint8_t)AllocationType::NotCold) + Str += "NotCold"; + if (AllocTypes & (uint8_t)AllocationType::Cold) + Str += "Cold"; + return Str; +} + template template void CallsiteContextGraph::addStackNodesForMIB( ContextNode *AllocNode, CallStack &StackContext, - CallStack &CallsiteContext, AllocationType AllocType) { + CallStack &CallsiteContext, AllocationType AllocType, + uint64_t TotalSize) { + assert(!MemProfReportHintedSizes || TotalSize > 0); // Treating the hot alloc type as NotCold before the disambiguation for "hot" // is done. if (AllocType == AllocationType::Hot) @@ -1016,6 +1034,11 @@ void CallsiteContextGraph::addStackNodesForMIB( ContextIdToAllocationType[++LastContextId] = AllocType; + if (MemProfReportHintedSizes) { + assert(TotalSize); + ContextIdToTotalSize[LastContextId] = TotalSize; + } + // Update alloc type and context ids for this MIB. AllocNode->AllocTypes |= (uint8_t)AllocType; @@ -1060,6 +1083,10 @@ CallsiteContextGraph::duplicateContextIds( assert(ContextIdToAllocationType.count(OldId)); // The new context has the same allocation type as original. ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId]; + // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy + // up the size. Assume that if we are able to duplicate context ids that we + // will be able to disambiguate all copies. + ContextIdToTotalSize[LastContextId] = 0; } return NewContextIds; } @@ -1663,7 +1690,7 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( CallStack StackContext(StackNode); addStackNodesForMIB( AllocNode, StackContext, CallsiteContext, - getMIBAllocType(MIBMD)); + getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD)); } assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); // Memprof and callsite metadata on memory allocations no longer @@ -1735,12 +1762,20 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( // stack ids on the allocation call during ModuleSummaryAnalysis. CallStack::const_iterator> EmptyContext; + unsigned I = 0; + assert(!MemProfReportHintedSizes || + (AN.TotalSizes && AN.TotalSizes->size() == AN.MIBs.size())); // Now add all of the MIBs and their stack nodes. for (auto &MIB : AN.MIBs) { CallStack::const_iterator> StackContext(&MIB); + uint64_t TotalSize = 0; + if (MemProfReportHintedSizes) + TotalSize = (*AN.TotalSizes)[I]; addStackNodesForMIB::const_iterator>( - AllocNode, StackContext, EmptyContext, MIB.AllocType); + AllocNode, StackContext, EmptyContext, MIB.AllocType, + TotalSize); + I++; } assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); // Initialize version 0 on the summary alloc node to the current alloc @@ -2171,17 +2206,6 @@ bool IndexCallsiteContextGraph::calleeMatchesFunc( return true; } -static std::string getAllocTypeString(uint8_t AllocTypes) { - if (!AllocTypes) - return "None"; - std::string Str; - if (AllocTypes & (uint8_t)AllocationType::NotCold) - Str += "NotCold"; - if (AllocTypes & (uint8_t)AllocationType::Cold) - Str += "Cold"; - return Str; -} - template void CallsiteContextGraph::ContextNode::dump() const { @@ -2261,6 +2285,30 @@ void CallsiteContextGraph::print( } } +template +void CallsiteContextGraph::printTotalSizes( + raw_ostream &OS) const { + using GraphType = const CallsiteContextGraph *; + for (const auto Node : nodes(this)) { + if (Node->isRemoved()) + continue; + if (!Node->IsAllocation) + continue; + auto ContextIds = Node->getContextIds(); + std::vector SortedIds(ContextIds.begin(), ContextIds.end()); + std::sort(SortedIds.begin(), SortedIds.end()); + for (auto Id : SortedIds) { + auto SizeI = ContextIdToTotalSize.find(Id); + assert(SizeI != ContextIdToTotalSize.end()); + auto TypeI = ContextIdToAllocationType.find(Id); + assert(TypeI != ContextIdToAllocationType.end()); + OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id + << " with total size " << SizeI->second << " is " + << getAllocTypeString(Node->AllocTypes) << " after cloning\n"; + } + } +} + template void CallsiteContextGraph::check() const { using GraphType = const CallsiteContextGraph *; @@ -3797,6 +3845,9 @@ bool CallsiteContextGraph::process() { if (ExportToDot) exportToDot("clonefuncassign"); + if (MemProfReportHintedSizes) + printTotalSizes(errs()); + return Changed; } diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll index 54e01e5fcdf95..e080279a16412 100644 --- a/llvm/test/ThinLTO/X86/memprof-basic.ll +++ b/llvm/test/ThinLTO/X86/memprof-basic.ll @@ -34,7 +34,7 @@ ;; -stats requires asserts ; REQUIRES: asserts -; RUN: opt -thinlto-bc %s >%t.o +; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o ; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ ; RUN: -supports-hot-cold-new \ ; RUN: -r=%t.o,main,plx \ @@ -43,9 +43,11 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ +; RUN: -memprof-report-hinted-sizes \ ; RUN: -stats -pass-remarks=memprof-context-disambiguation -save-temps \ ; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS +; RUN: --check-prefix=STATS --check-prefix=STATS-BE --check-prefix=REMARKS \ +; RUN: --check-prefix=SIZES ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. @@ -64,9 +66,10 @@ ; RUN: -r=%t.o,_Znam, \ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t2. \ +; RUN: -memprof-report-hinted-sizes \ ; RUN: -stats -pass-remarks=memprof-context-disambiguation \ ; RUN: -o %t2.out 2>&1 | FileCheck %s --check-prefix=DUMP \ -; RUN: --check-prefix=STATS +; RUN: --check-prefix=STATS --check-prefix=SIZES ; RUN: cat %t2.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. @@ -125,9 +128,9 @@ attributes #0 = { noinline optnone } !0 = !{i64 8632435727821051414} !1 = !{i64 -3421689549917153178} !2 = !{!3, !5} -!3 = !{!4, !"notcold"} +!3 = !{!4, !"notcold", i64 100} !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} -!5 = !{!6, !"cold"} +!5 = !{!6, !"cold", i64 400} !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} !7 = !{i64 9086428284934609951} !8 = !{i64 -5964873800580613432} @@ -265,6 +268,10 @@ attributes #0 = { noinline optnone } ; DUMP: Clone of [[BAR]] +; SIZES: NotCold context 1 with total size 100 is NotCold after cloning +; SIZES: Cold context 2 with total size 400 is Cold after cloning + + ; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 ; REMARKS: created clone _Z3barv.memprof.1 ; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll index 483582c6ced95..c685a413718a6 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll @@ -38,8 +38,9 @@ ; RUN: -memprof-verify-ccg -memprof-verify-nodes -memprof-dump-ccg \ ; RUN: -memprof-export-to-dot -memprof-dot-file-path-prefix=%t. \ ; RUN: -stats -pass-remarks=memprof-context-disambiguation \ +; RUN: -memprof-report-hinted-sizes \ ; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=DUMP --check-prefix=IR \ -; RUN: --check-prefix=STATS --check-prefix=REMARKS +; RUN: --check-prefix=STATS --check-prefix=REMARKS --check-prefix=SIZES ; RUN: cat %t.ccg.postbuild.dot | FileCheck %s --check-prefix=DOT ;; We should have cloned bar, baz, and foo, for the cold memory allocation. @@ -105,9 +106,9 @@ attributes #6 = { builtin } !0 = !{i64 8632435727821051414} !1 = !{i64 -3421689549917153178} !2 = !{!3, !5} -!3 = !{!4, !"notcold"} +!3 = !{!4, !"notcold", i64 100} !4 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 8632435727821051414} -!5 = !{!6, !"cold"} +!5 = !{!6, !"cold", i64 400} !6 = !{i64 9086428284934609951, i64 -5964873800580613432, i64 2732490490862098848, i64 -3421689549917153178} !7 = !{i64 9086428284934609951} !8 = !{i64 -5964873800580613432} @@ -249,6 +250,10 @@ attributes #6 = { builtin } ; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold +; SIZES: NotCold context 1 with total size 100 is NotCold after cloning +; SIZES: Cold context 2 with total size 400 is Cold after cloning + + ; IR: define {{.*}} @main ;; The first call to foo does not allocate cold memory. It should call the ;; original functions, which ultimately call the original allocation decorated From f320adfa050b3bec2e5094ef7d97f4cdc9d7c5e7 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 11 Jul 2024 08:52:20 -0700 Subject: [PATCH 2/5] Address comments --- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 6 ++---- llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp | 6 ++++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 7b8f6371b5698..4463eb48b9811 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -4706,8 +4706,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { writeFunctionHeapProfileRecords( Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev, /*PerModule*/ false, - /*GetValueId*/ - [&](const ValueInfo &VI) -> unsigned { + /*GetValueId*/ [&](const ValueInfo &VI) -> unsigned { std::optional ValueID = GetValueId(VI); // This can happen in shared index files for distributed ThinLTO if // the callee function summary is not included. Record 0 which we @@ -4717,8 +4716,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { return 0; return *ValueID; }, - /*GetStackIndex*/ - [&](unsigned I) { + /*GetStackIndex*/ [&](unsigned I) { // Get the corresponding index into the list of StackIds actually // being written for this combined index (which may be a subset in // the case of distributed indexes). diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index f5713d31e263f..01138555a8fbc 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -614,7 +614,9 @@ class CallsiteContextGraph { /// Map from each context ID to the AllocationType assigned to that context. DenseMap ContextIdToAllocationType; - std::map ContextIdToTotalSize; + /// Map from each contextID to the profiled aggregate allocation size, + /// optionally populated when requested (via MemProfReportHintedSizes). + DenseMap ContextIdToTotalSize; /// Identifies the context node created for a stack id when adding the MIB /// contexts to the graph. This is used to locate the context nodes when @@ -2294,7 +2296,7 @@ void CallsiteContextGraph::printTotalSizes( continue; if (!Node->IsAllocation) continue; - auto ContextIds = Node->getContextIds(); + DenseSet ContextIds = Node->getContextIds(); std::vector SortedIds(ContextIds.begin(), ContextIds.end()); std::sort(SortedIds.begin(), SortedIds.end()); for (auto Id : SortedIds) { From 8be1c3cc1eee302c15fe105ca1ac5f38bde2fcec Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 11 Jul 2024 13:59:18 -0700 Subject: [PATCH 3/5] Address comments --- llvm/include/llvm/Bitcode/LLVMBitCodes.h | 15 +--- llvm/include/llvm/IR/ModuleSummaryIndex.h | 10 +-- llvm/include/llvm/IR/ModuleSummaryIndexYAML.h | 3 +- llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 6 +- llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp | 2 - llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 58 ++++++++------- llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 72 +++++++------------ .../IPO/MemProfContextDisambiguation.cpp | 4 +- llvm/test/Bitcode/summary_version.ll | 2 +- .../thinlto-func-summary-vtableref-pgo.ll | 2 +- 10 files changed, 71 insertions(+), 103 deletions(-) diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 9f989aaf22779..4e6ef4c1a8c31 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -307,7 +307,8 @@ enum GlobalValueSummarySymtabCodes { // [valueid, n x stackidindex] FS_PERMODULE_CALLSITE_INFO = 26, // Summary of per-module allocation memprof metadata. - // [n x (alloc type, nummib, nummib x stackidindex)] + // [nummib, nummib x (alloc type, nummib, nummib x stackidindex), + // [nummib x total size]?] FS_PERMODULE_ALLOC_INFO = 27, // Summary of combined index memprof callsite metadata. // [valueid, numstackindices, numver, @@ -316,19 +317,9 @@ enum GlobalValueSummarySymtabCodes { // Summary of combined index allocation memprof metadata. // [nummib, numver, // nummib x (alloc type, numstackids, numstackids x stackidindex), - // numver x version] + // numver x version, [nummib x total size]?] FS_COMBINED_ALLOC_INFO = 29, FS_STACK_IDS = 30, - // Summary of per-module allocation memprof metadata when we are tracking - // total sizes. - // [n x (alloc type, total size, nummib, nummib x stackidindex)] - FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES = 31, - // Summary of combined index allocation memprof metadata when we are tracking - // total sizes. - // [nummib, numver, - // nummib x (alloc type, total size, numstackids, - // numstackids x stackidindex), numver x version] - FS_COMBINED_ALLOC_INFO_TOTAL_SIZES = 32, }; enum MetadataCodes { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 950c0ed824dbb..949e60ffd1dbf 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -404,8 +404,8 @@ struct AllocInfo { std::vector MIBs; // If requested, keep track of total profiled sizes for each MIB. This will be - // a vector of the same length and order as the MIBs vector. - std::unique_ptr> TotalSizes; + // a vector of the same length and order as the MIBs vector, if non-empty. + std::vector TotalSizes; AllocInfo(std::vector MIBs) : MIBs(std::move(MIBs)) { Versions.push_back(0); @@ -427,10 +427,10 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) { for (auto &M : AE.MIBs) { OS << "\t\t" << M << "\n"; } - if (AE.TotalSizes) { + if (!AE.TotalSizes.empty()) { OS << " TotalSizes per MIB:\n\t\t"; First = true; - for (auto &TS : *AE.TotalSizes) { + for (auto &TS : AE.TotalSizes) { if (!First) OS << ", "; First = false; @@ -1445,7 +1445,7 @@ class ModuleSummaryIndex { // in the way some record are interpreted, like flags for instance. // Note that incrementing this may require changes in both BitcodeReader.cpp // and BitcodeWriter.cpp. - static constexpr uint64_t BitcodeSummaryVersion = 9; + static constexpr uint64_t BitcodeSummaryVersion = 10; // Regular LTO module name for ASM writer static constexpr const char *getRegularLTOModuleName() { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h index 26bf08f9f7439..b2747d24c5396 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndexYAML.h @@ -224,7 +224,6 @@ template <> struct CustomMappingTraits { V.emplace(RefGUID, /*IsAnalysis=*/false); Refs.push_back(ValueInfo(/*IsAnalysis=*/false, &*V.find(RefGUID))); } - std::vector EmptyAllocInfo; Elem.SummaryList.push_back(std::make_unique( GlobalValueSummary::GVFlags( static_cast(FSum.Linkage), @@ -239,7 +238,7 @@ template <> struct CustomMappingTraits { std::move(FSum.TypeTestAssumeConstVCalls), std::move(FSum.TypeCheckedLoadConstVCalls), ArrayRef{}, ArrayRef{}, - std::move(EmptyAllocInfo))); + ArrayRef{})); } } static void output(IO &io, GlobalValueSummaryMapTy &V) { diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp index c3b6fd8bc20af..e9490ccba8215 100644 --- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -548,8 +548,7 @@ static void computeFunctionSummary( Allocs.push_back(AllocInfo(std::move(MIBs))); if (MemProfReportHintedSizes) { assert(Allocs.back().MIBs.size() == TotalSizes.size()); - Allocs.back().TotalSizes = - std::make_unique>(std::move(TotalSizes)); + Allocs.back().TotalSizes = std::move(TotalSizes); } } else if (!InstCallsite.empty()) { SmallVector StackIdIndices; @@ -948,7 +947,6 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( CantBePromoted.insert(GV->getGUID()); // Create the appropriate summary type. if (Function *F = dyn_cast(GV)) { - std::vector EmptyAllocInfo; std::unique_ptr Summary = std::make_unique( GVFlags, /*InstCount=*/0, @@ -971,7 +969,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( ArrayRef{}, ArrayRef{}, ArrayRef{}, - ArrayRef{}, std::move(EmptyAllocInfo)); + ArrayRef{}, ArrayRef{}); Index.addGlobalValueSummary(*GV, std::move(Summary)); } else { std::unique_ptr Summary = diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp index 82aa7b7e077a1..b7ed9cdf63145 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp @@ -328,8 +328,6 @@ GetCodeName(unsigned CodeID, unsigned BlockID, STRINGIFY_CODE(FS, COMBINED_CALLSITE_INFO) STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO) STRINGIFY_CODE(FS, STACK_IDS) - STRINGIFY_CODE(FS, PERMODULE_ALLOC_INFO_TOTAL_SIZES) - STRINGIFY_CODE(FS, COMBINED_ALLOC_INFO_TOTAL_SIZES) } case bitc::METADATA_ATTACHMENT_ID: switch (CodeID) { diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 1cba859a3fade..6203c6e5119d1 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -7991,19 +7991,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { break; } - case bitc::FS_PERMODULE_ALLOC_INFO: - case bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES: { - bool HasTotalSizes = BitCode == bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES; + case bitc::FS_PERMODULE_ALLOC_INFO: { unsigned I = 0; std::vector MIBs; - std::vector TotalSizes; - while (I < Record.size()) { - assert(Record.size() - I >= (HasTotalSizes ? 3 : 2)); + unsigned NumMIBs = 0; + if (Version >= 10) + NumMIBs = Record[I++]; + unsigned MIBsRead = 0; + while ((Version >= 10 && MIBsRead++ < NumMIBs) || + (Version < 10 && I < Record.size())) { + assert(Record.size() - I >= 2); AllocationType AllocType = (AllocationType)Record[I++]; - if (HasTotalSizes) { - TotalSizes.push_back(Record[I++]); - assert(TotalSizes.back()); - } unsigned NumStackEntries = Record[I++]; assert(Record.size() - I >= NumStackEntries); SmallVector StackIdList; @@ -8014,32 +8012,31 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { } MIBs.push_back(MIBInfo(AllocType, std::move(StackIdList))); } + std::vector TotalSizes; + // We either have no sizes or NumMIBs of them. + assert(I == Record.size() || Record.size() - I == NumMIBs); + if (I < Record.size()) { + MIBsRead = 0; + while (MIBsRead++ < NumMIBs) + TotalSizes.push_back(Record[I++]); + } PendingAllocs.push_back(AllocInfo(std::move(MIBs))); - assert(HasTotalSizes != TotalSizes.empty()); - if (HasTotalSizes) { + if (!TotalSizes.empty()) { assert(PendingAllocs.back().MIBs.size() == TotalSizes.size()); - PendingAllocs.back().TotalSizes = - std::make_unique>(std::move(TotalSizes)); + PendingAllocs.back().TotalSizes = std::move(TotalSizes); } break; } - case bitc::FS_COMBINED_ALLOC_INFO: - case bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES: { - bool HasTotalSizes = BitCode == bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES; + case bitc::FS_COMBINED_ALLOC_INFO: { unsigned I = 0; std::vector MIBs; - std::vector TotalSizes; unsigned NumMIBs = Record[I++]; unsigned NumVersions = Record[I++]; unsigned MIBsRead = 0; while (MIBsRead++ < NumMIBs) { - assert(Record.size() - I >= (HasTotalSizes ? 3 : 2)); + assert(Record.size() - I >= 2); AllocationType AllocType = (AllocationType)Record[I++]; - if (HasTotalSizes) { - TotalSizes.push_back(Record[I++]); - assert(TotalSizes.back()); - } unsigned NumStackEntries = Record[I++]; assert(Record.size() - I >= NumStackEntries); SmallVector StackIdList; @@ -8054,13 +8051,20 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) { SmallVector Versions; for (unsigned J = 0; J < NumVersions; J++) Versions.push_back(Record[I++]); + std::vector TotalSizes; + // We either have no sizes or NumMIBs of them. + assert(I == Record.size() || Record.size() - I == NumMIBs); + if (I < Record.size()) { + MIBsRead = 0; + while (MIBsRead++ < NumMIBs) { + TotalSizes.push_back(Record[I++]); + } + } PendingAllocs.push_back( AllocInfo(std::move(Versions), std::move(MIBs))); - assert(HasTotalSizes != TotalSizes.empty()); - if (HasTotalSizes) { + if (!TotalSizes.empty()) { assert(PendingAllocs.back().MIBs.size() == TotalSizes.size()); - PendingAllocs.back().TotalSizes = - std::make_unique>(std::move(TotalSizes)); + PendingAllocs.back().TotalSizes = std::move(TotalSizes); } break; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 4463eb48b9811..b3ebe70e8c52f 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -230,8 +230,7 @@ class ModuleBitcodeWriterBase : public BitcodeWriterBase { void writePerModuleFunctionSummaryRecord( SmallVector &NameVals, GlobalValueSummary *Summary, unsigned ValueID, unsigned FSCallsAbbrev, unsigned FSCallsProfileAbbrev, - unsigned CallsiteAbbrev, unsigned AllocAbbrev, - unsigned AllocTotalSizeAbbrev, const Function &F); + unsigned CallsiteAbbrev, unsigned AllocAbbrev, const Function &F); void writeModuleLevelReferences(const GlobalVariable &V, SmallVector &NameVals, unsigned FSModRefsAbbrev, @@ -4159,7 +4158,7 @@ static void writeTypeIdCompatibleVtableSummaryRecord( static void writeFunctionHeapProfileRecords( BitstreamWriter &Stream, FunctionSummary *FS, unsigned CallsiteAbbrev, - unsigned AllocAbbrev, unsigned AllocTotalSizeAbbrev, bool PerModule, + unsigned AllocAbbrev, bool PerModule, std::function GetValueID, std::function GetStackIndex) { SmallVector Record; @@ -4190,35 +4189,27 @@ static void writeFunctionHeapProfileRecords( // Per module alloc versions should always have a single entry of // value 0. assert(!PerModule || (AI.Versions.size() == 1 && AI.Versions[0] == 0)); - if (!PerModule) { - Record.push_back(AI.MIBs.size()); + Record.push_back(AI.MIBs.size()); + if (!PerModule) Record.push_back(AI.Versions.size()); - } - unsigned I = 0; - assert(!AI.TotalSizes || AI.TotalSizes->size() == AI.MIBs.size()); for (auto &MIB : AI.MIBs) { Record.push_back((uint8_t)MIB.AllocType); - if (AI.TotalSizes) { - Record.push_back((*AI.TotalSizes)[I]); - assert((*AI.TotalSizes)[I]); - } Record.push_back(MIB.StackIdIndices.size()); for (auto Id : MIB.StackIdIndices) Record.push_back(GetStackIndex(Id)); - I++; } if (!PerModule) { for (auto V : AI.Versions) Record.push_back(V); } - unsigned Code = - PerModule ? (AI.TotalSizes ? bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES - : bitc::FS_PERMODULE_ALLOC_INFO) - : (AI.TotalSizes ? bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES - : bitc::FS_COMBINED_ALLOC_INFO); - unsigned AllocAbbrevToUse = - AI.TotalSizes ? AllocTotalSizeAbbrev : AllocAbbrev; - Stream.EmitRecord(Code, Record, AllocAbbrevToUse); + assert(AI.TotalSizes.empty() || AI.TotalSizes.size() == AI.MIBs.size()); + if (!AI.TotalSizes.empty()) { + for (auto Size : AI.TotalSizes) + Record.push_back(Size); + } + Stream.EmitRecord(PerModule ? bitc::FS_PERMODULE_ALLOC_INFO + : bitc::FS_COMBINED_ALLOC_INFO, + Record, AllocAbbrev); } } @@ -4227,7 +4218,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord( SmallVector &NameVals, GlobalValueSummary *Summary, unsigned ValueID, unsigned FSCallsRelBFAbbrev, unsigned FSCallsProfileAbbrev, unsigned CallsiteAbbrev, - unsigned AllocAbbrev, unsigned AllocTotalSizeAbbrev, const Function &F) { + unsigned AllocAbbrev, const Function &F) { NameVals.push_back(ValueID); FunctionSummary *FS = cast(Summary); @@ -4238,7 +4229,7 @@ void ModuleBitcodeWriterBase::writePerModuleFunctionSummaryRecord( }); writeFunctionHeapProfileRecords( - Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev, + Stream, FS, CallsiteAbbrev, AllocAbbrev, /*PerModule*/ true, /*GetValueId*/ [&](const ValueInfo &VI) { return getValueId(VI); }, /*GetStackIndex*/ [&](unsigned I) { return I; }); @@ -4445,18 +4436,13 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { Abbv = std::make_shared(); Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO)); + Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib // n x (alloc type, numstackids, numstackids x stackidindex) + // optional: nummib x total size Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv)); - Abbv = std::make_shared(); - Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_ALLOC_INFO_TOTAL_SIZES)); - // n x (alloc type, total size, numstackids, numstackids x stackidindex) - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned AllocTotalSizeAbbrev = Stream.EmitAbbrev(std::move(Abbv)); - SmallVector NameVals; // Iterate over the list of functions instead of the Index to // ensure the ordering is stable. @@ -4474,10 +4460,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() { continue; } auto *Summary = VI.getSummaryList()[0].get(); - writePerModuleFunctionSummaryRecord(NameVals, Summary, VE.getValueID(&F), - FSCallsRelBFAbbrev, - FSCallsProfileAbbrev, CallsiteAbbrev, - AllocAbbrev, AllocTotalSizeAbbrev, F); + writePerModuleFunctionSummaryRecord( + NameVals, Summary, VE.getValueID(&F), FSCallsRelBFAbbrev, + FSCallsProfileAbbrev, CallsiteAbbrev, AllocAbbrev, F); } // Capture references from GlobalVariable initializers, which are outside @@ -4597,6 +4582,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver // nummib x (alloc type, numstackids, numstackids x stackidindex), // numver x version + // optional: nummib x total size Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); unsigned AllocAbbrev = Stream.EmitAbbrev(std::move(Abbv)); @@ -4607,16 +4593,6 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { return DecSummaries->count(GVS); }; - Abbv = std::make_shared(); - Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALLOC_INFO_TOTAL_SIZES)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // nummib - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numver - // nummib x (alloc type, total size, numstackids, numstackids x stackidindex), - // numver x version - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array)); - Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); - unsigned AllocTotalSizeAbbrev = Stream.EmitAbbrev(std::move(Abbv)); - // The aliases are emitted as a post-pass, and will point to the value // id of the aliasee. Save them in a vector for post-processing. SmallVector Aliases; @@ -4704,9 +4680,10 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { getReferencedTypeIds(FS, ReferencedTypeIds); writeFunctionHeapProfileRecords( - Stream, FS, CallsiteAbbrev, AllocAbbrev, AllocTotalSizeAbbrev, + Stream, FS, CallsiteAbbrev, AllocAbbrev, /*PerModule*/ false, - /*GetValueId*/ [&](const ValueInfo &VI) -> unsigned { + /*GetValueId*/ + [&](const ValueInfo &VI) -> unsigned { std::optional ValueID = GetValueId(VI); // This can happen in shared index files for distributed ThinLTO if // the callee function summary is not included. Record 0 which we @@ -4716,7 +4693,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { return 0; return *ValueID; }, - /*GetStackIndex*/ [&](unsigned I) { + /*GetStackIndex*/ + [&](unsigned I) { // Get the corresponding index into the list of StackIds actually // being written for this combined index (which may be a subset in // the case of distributed indexes). diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 01138555a8fbc..ef9ddeaaab632 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -1766,14 +1766,14 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( EmptyContext; unsigned I = 0; assert(!MemProfReportHintedSizes || - (AN.TotalSizes && AN.TotalSizes->size() == AN.MIBs.size())); + AN.TotalSizes.size() == AN.MIBs.size()); // Now add all of the MIBs and their stack nodes. for (auto &MIB : AN.MIBs) { CallStack::const_iterator> StackContext(&MIB); uint64_t TotalSize = 0; if (MemProfReportHintedSizes) - TotalSize = (*AN.TotalSizes)[I]; + TotalSize = AN.TotalSizes[I]; addStackNodesForMIB::const_iterator>( AllocNode, StackContext, EmptyContext, MIB.AllocType, TotalSize); diff --git a/llvm/test/Bitcode/summary_version.ll b/llvm/test/Bitcode/summary_version.ll index 98feab6fe2f99..26c64f81a773f 100644 --- a/llvm/test/Bitcode/summary_version.ll +++ b/llvm/test/Bitcode/summary_version.ll @@ -2,7 +2,7 @@ ; RUN: opt -module-summary %s -o - | llvm-bcanalyzer -dump | FileCheck %s ; CHECK: +; CHECK: diff --git a/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll index 19e228fd5355c..b3f1e770810d2 100644 --- a/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll +++ b/llvm/test/Bitcode/thinlto-func-summary-vtableref-pgo.ll @@ -11,7 +11,7 @@ ; RUN: llvm-dis -o - %t.o | llvm-as -o - | llvm-dis -o - | FileCheck %s --check-prefix=DIS ; CHECK: +; CHECK-NEXT: ; The `VALUE_GUID` below represents the "_ZTV4Base" referenced by the instruction ; that loads vtable pointers. From 3ad6a4ee65c6a76e227f2e38b081cf9f7ff362e0 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 11 Jul 2024 14:01:23 -0700 Subject: [PATCH 4/5] Fix typo in original comment --- llvm/include/llvm/Bitcode/LLVMBitCodes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 4e6ef4c1a8c31..184bbe32df695 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -307,7 +307,7 @@ enum GlobalValueSummarySymtabCodes { // [valueid, n x stackidindex] FS_PERMODULE_CALLSITE_INFO = 26, // Summary of per-module allocation memprof metadata. - // [nummib, nummib x (alloc type, nummib, nummib x stackidindex), + // [nummib, nummib x (alloc type, numstackids, numstackids x stackidindex), // [nummib x total size]?] FS_PERMODULE_ALLOC_INFO = 27, // Summary of combined index memprof callsite metadata. From 75a8eb9b07c0c243410861fc0e2adaa1e4a53269 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Thu, 11 Jul 2024 15:19:48 -0700 Subject: [PATCH 5/5] Address comments --- llvm/include/llvm/IR/ModuleSummaryIndex.h | 2 +- llvm/test/ThinLTO/X86/memprof-basic.ll | 2 -- llvm/test/Transforms/MemProfContextDisambiguation/basic.ll | 2 -- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 949e60ffd1dbf..00934cc1ce6f2 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -430,7 +430,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const AllocInfo &AE) { if (!AE.TotalSizes.empty()) { OS << " TotalSizes per MIB:\n\t\t"; First = true; - for (auto &TS : AE.TotalSizes) { + for (uint64_t TS : AE.TotalSizes) { if (!First) OS << ", "; First = false; diff --git a/llvm/test/ThinLTO/X86/memprof-basic.ll b/llvm/test/ThinLTO/X86/memprof-basic.ll index e080279a16412..6922dbfd36846 100644 --- a/llvm/test/ThinLTO/X86/memprof-basic.ll +++ b/llvm/test/ThinLTO/X86/memprof-basic.ll @@ -267,11 +267,9 @@ attributes #0 = { noinline optnone } ; DUMP: Edge from Callee [[BAR2]] to Caller: [[BAZ2]] AllocTypes: Cold ContextIds: 2 ; DUMP: Clone of [[BAR]] - ; SIZES: NotCold context 1 with total size 100 is NotCold after cloning ; SIZES: Cold context 2 with total size 400 is Cold after cloning - ; REMARKS: call in clone main assigned to call function clone _Z3foov.memprof.1 ; REMARKS: created clone _Z3barv.memprof.1 ; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll index c685a413718a6..a82f872d51c7d 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/basic.ll @@ -249,11 +249,9 @@ attributes #6 = { builtin } ; REMARKS: call in clone _Z3bazv assigned to call function clone _Z3barv ; REMARKS: call in clone _Z3barv marked with memprof allocation attribute notcold - ; SIZES: NotCold context 1 with total size 100 is NotCold after cloning ; SIZES: Cold context 2 with total size 400 is Cold after cloning - ; IR: define {{.*}} @main ;; The first call to foo does not allocate cold memory. It should call the ;; original functions, which ultimately call the original allocation decorated