diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 0e6245acb77e8..b7f6e59b45a3e 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -932,6 +932,18 @@ struct IndexedMemProfData { llvm::MapVector> CallStackData; }; +struct FrameStat { + // The number of occurrences of a given FrameId. + uint64_t Count = 0; + // The sum of indexes where a given FrameId shows up. + uint64_t PositionSum = 0; +}; + +// Compute a histogram of Frames in call stacks. +llvm::DenseMap +computeFrameHistogram(llvm::MapVector> + &MemProfCallStackData); + // Construct a radix tree of call stacks. // // A set of call stacks might look like: @@ -1027,9 +1039,11 @@ class CallStackRadixTreeBuilder { CallStackRadixTreeBuilder() = default; // Build a radix tree array. - void build(llvm::MapVector> - &&MemProfCallStackData, - const llvm::DenseMap &MemProfFrameIndexes); + void + build(llvm::MapVector> + &&MemProfCallStackData, + const llvm::DenseMap &MemProfFrameIndexes, + llvm::DenseMap &FrameHistogram); const std::vector &getRadixArray() const { return RadixArray; } diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index a73f72a534f16..4d41b6b77559b 100644 --- a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -494,17 +494,40 @@ static uint64_t writeMemProfFrames( static llvm::DenseMap writeMemProfFrameArray( ProfOStream &OS, - llvm::MapVector &MemProfFrameData) { + llvm::MapVector &MemProfFrameData, + llvm::DenseMap &FrameHistogram) { // Mappings from FrameIds to array indexes. llvm::DenseMap MemProfFrameIndexes; - // Sort the FrameIDs for stability. + // Compute the order in which we serialize Frames. The order does not matter + // in terms of correctness, but we still compute it for deserialization + // performance. 
Specifically, if we serialize frequently used Frames one + // after another, we have better cache utilization. For two Frames that + // appear equally frequently, we break a tie by serializing the one that tends + // to appear earlier in call stacks. We implement the tie-breaking mechanism + // by computing the sum of indexes within call stacks for each Frame. If we + // still have a tie, then we just resort to comparing two FrameIds, which is + // just for stability of output. std::vector> FrameIdOrder; FrameIdOrder.reserve(MemProfFrameData.size()); for (const auto &[Id, Frame] : MemProfFrameData) FrameIdOrder.emplace_back(Id, &Frame); assert(MemProfFrameData.size() == FrameIdOrder.size()); - llvm::sort(FrameIdOrder); + llvm::sort(FrameIdOrder,
               [&](const std::pair &L,
                   const std::pair &R) {
                 const auto &SL = FrameHistogram[L.first];
                 const auto &SR = FrameHistogram[R.first];
                 // Popular FrameIds should come first.
                 if (SL.Count != SR.Count)
                   return SL.Count > SR.Count;
                 // If they are equally popular, then the one that tends to appear
                 // earlier in call stacks should come first.
                 if (SL.PositionSum != SR.PositionSum)
                   return SL.PositionSum < SR.PositionSum;
                 // Compare their FrameIds for sort stability.
                 return L.first < R.first;
               }); // Serialize all frames while creating mappings from linear IDs to FrameIds. 
uint64_t Index = 0; @@ -543,12 +566,14 @@ writeMemProfCallStackArray( llvm::MapVector> &MemProfCallStackData, llvm::DenseMap - &MemProfFrameIndexes) { + &MemProfFrameIndexes, + llvm::DenseMap &FrameHistogram) { llvm::DenseMap MemProfCallStackIndexes; memprof::CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes); + Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes, + FrameHistogram); for (auto I : Builder.getRadixArray()) OS.write32(I); MemProfCallStackIndexes = Builder.takeCallStackPos(); @@ -704,13 +729,17 @@ static Error writeMemProfV3(ProfOStream &OS, Schema = memprof::getFullSchema(); writeMemProfSchema(OS, Schema); + llvm::DenseMap FrameHistogram = + memprof::computeFrameHistogram(MemProfData.CallStackData); + assert(MemProfData.FrameData.size() == FrameHistogram.size()); + llvm::DenseMap MemProfFrameIndexes = - writeMemProfFrameArray(OS, MemProfData.FrameData); + writeMemProfFrameArray(OS, MemProfData.FrameData, FrameHistogram); uint64_t CallStackPayloadOffset = OS.tell(); llvm::DenseMap MemProfCallStackIndexes = writeMemProfCallStackArray( - OS, MemProfData.CallStackData, MemProfFrameIndexes); + OS, MemProfData.CallStackData, MemProfFrameIndexes, FrameHistogram); uint64_t RecordPayloadOffset = OS.tell(); uint64_t RecordTableOffset = diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index aecac24169526..9111ffd1ce786 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -485,7 +485,8 @@ LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack( void CallStackRadixTreeBuilder::build( llvm::MapVector> &&MemProfCallStackData, - const llvm::DenseMap &MemProfFrameIndexes) { + const llvm::DenseMap &MemProfFrameIndexes, + llvm::DenseMap &FrameHistogram) { // Take the vector portion of MemProfCallStackData. The vector is exactly // what we need to sort. Also, we no longer need its lookup capability. 
llvm::SmallVector CallStacks = MemProfCallStackData.takeVector(); @@ -497,14 +498,56 @@ void CallStackRadixTreeBuilder::build( return; } - // Sort the list of call stacks in the dictionary order to maximize the length - // of the common prefix between two adjacent call stacks. + // Sorting the list of call stacks in the dictionary order is sufficient to + // maximize the length of the common prefix between two adjacent call stacks + // and thus minimize the length of RadixArray. However, we go one step + // further and try to reduce the number of times we follow pointers to parents + // during deserialization. Consider a poorly encoded radix tree: + // + // CallStackId 1: f1 -> f2 -> f3 + // | + // CallStackId 2: +--- f4 -> f5 + // | + // CallStackId 3: +--> f6 + // + // Here, f2 and f4 appear once and twice, respectively, in the call stacks. + // Once we encode CallStackId 1 into RadixArray, every other call stack with + // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3 + // shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to + // parents twice. + // + // We try to alleviate the situation by sorting the list of call stacks by + // comparing the popularity of frames rather than the integer values of + // FrameIds. In the example above, f4 is more popular than f2, so we sort the + // call stacks and encode them as: + // + // CallStackId 2: f1 -- f4 -> f5 + // | | + // CallStackId 3: | +--> f6 + // | + // CallStackId 1: +--> f2 -> f3 + // + // Notice that CallStackId 3 follows a pointer to a parent only once. + // + // All this is a quick-n-dirty trick to reduce the number of jumps. The + // proper way would be to compute the weight of each radix tree node -- how + // many call stacks use a given radix tree node, and encode a radix tree from + // the heaviest node first. We do not do so because that's a lot of work. 
llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) { // Call stacks are stored from leaf to root. Perform comparisons from the // root. return std::lexicographical_compare( L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(), - [&](FrameId F1, FrameId F2) { return F1 < F2; }); + [&](FrameId F1, FrameId F2) { + uint64_t H1 = FrameHistogram[F1].Count; + uint64_t H2 = FrameHistogram[F2].Count; + // Popular frames should come later because we encode call stacks from + // the last one in the list. + if (H1 != H2) + return H1 < H2; + // For sort stability. + return F1 < F2; + }); }); // Reserve some reasonable amount of storage. @@ -568,6 +611,22 @@ void CallStackRadixTreeBuilder::build( V = RadixArray.size() - 1 - V; } +llvm::DenseMap +computeFrameHistogram(llvm::MapVector> + &MemProfCallStackData) { + llvm::DenseMap Histogram; + + for (const auto &KV : MemProfCallStackData) { + const auto &CS = KV.second; + for (unsigned I = 0, E = CS.size(); I != E; ++I) { + auto &S = Histogram[CS[I]]; + ++S.Count; + S.PositionSum += I; + } + } + return Histogram; +} + void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) { for (const auto &AS : Record.AllocSites) { assert(AS.CSId == hashCallStack(AS.CallStack)); diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp index 26421200e1a11..15eb59ee00c94 100644 --- a/llvm/unittests/ProfileData/MemProfTest.cpp +++ b/llvm/unittests/ProfileData/MemProfTest.cpp @@ -667,8 +667,12 @@ TEST(MemProf, MissingFrameId) { TEST(MemProf, RadixTreeBuilderEmpty) { llvm::DenseMap MemProfFrameIndexes; llvm::MapVector> MemProfCallStackData; + llvm::DenseMap + FrameHistogram = + llvm::memprof::computeFrameHistogram(MemProfCallStackData); llvm::memprof::CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes); + Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes, + FrameHistogram); 
ASSERT_THAT(Builder.getRadixArray(), testing::IsEmpty()); const auto Mappings = Builder.takeCallStackPos(); ASSERT_THAT(Mappings, testing::IsEmpty()); @@ -681,8 +685,12 @@ TEST(MemProf, RadixTreeBuilderOne) { llvm::SmallVector CS1 = {13, 12, 11}; llvm::MapVector> MemProfCallStackData; MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1}); + llvm::DenseMap + FrameHistogram = + llvm::memprof::computeFrameHistogram(MemProfCallStackData); llvm::memprof::CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes); + Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes, + FrameHistogram); EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({ 3U, // Size of CS1, 3U, // MemProfFrameIndexes[13] @@ -704,8 +712,12 @@ TEST(MemProf, RadixTreeBuilderTwo) { llvm::MapVector> MemProfCallStackData; MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1}); MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2}); + llvm::DenseMap + FrameHistogram = + llvm::memprof::computeFrameHistogram(MemProfCallStackData); llvm::memprof::CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes); + Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes, + FrameHistogram); EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({ 2U, // Size of CS1 @@ -738,8 +750,12 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) { MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2}); MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS3), CS3}); MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS4), CS4}); + llvm::DenseMap + FrameHistogram = + llvm::memprof::computeFrameHistogram(MemProfCallStackData); llvm::memprof::CallStackRadixTreeBuilder Builder; - Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes); + Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes, + 
FrameHistogram); EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({ 4U, // Size of CS1