Skip to content

Commit dc3f8c2

Browse files
[memprof] Improve deserialization performance in V3 (#94787)
We call llvm::sort in a couple of places in the V3 encoding:

- We sort Frames by FrameIds for stability of the output.
- We sort call stacks in the dictionary order to maximize the length of the common prefix between adjacent call stacks.

It turns out that we can improve the deserialization performance by modifying the comparison functions -- without changing the format at all. Both places take advantage of the histogram of Frames -- how many times each Frame occurs in the call stacks.

- Frames: We serialize popular Frames in the descending order of popularity for improved cache locality. For two equally popular Frames, we break a tie by serializing one that tends to appear earlier in call stacks. Here, "earlier" means a smaller index within llvm::SmallVector<FrameId>.
- Call Stacks: We sort the call stacks to reduce the number of times we follow pointers to parents during deserialization. Specifically, instead of comparing two call stacks in the strcmp style -- integer comparisons of FrameIds -- we compare two FrameIds F1 and F2 with Histogram[F1] < Histogram[F2] at respective indexes. Since we encode from the end of the sorted list of call stacks, we tend to encode popular call stacks first.

Since the two places use the same histogram, we compute it once and share it in the two places.

Sorting the call stacks reduces the number of "jumps" by 74% when we deserialize all MemProfRecords. The cycle and instruction counts go down by 10% and 1.5%, respectively.

If we sort the Frames in addition to the call stacks, then the cycle and instruction counts go down by 14% and 1.6%, respectively, relative to the same baseline (that is, without this patch).
1 parent 435dd97 commit dc3f8c2

File tree

4 files changed

+136
-18
lines changed

4 files changed

+136
-18
lines changed

llvm/include/llvm/ProfileData/MemProf.h

+17-3
Original file line numberDiff line numberDiff line change
@@ -932,6 +932,18 @@ struct IndexedMemProfData {
932932
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> CallStackData;
933933
};
934934

935+
// Per-FrameId statistics gathered across all call stacks; this is the value
// type of the histogram returned by computeFrameHistogram.
struct FrameStat {
936+
// The number of occurrences of a given FrameId, counted once for every
// position at which it appears in any call stack.
937+
uint64_t Count = 0;
938+
// The sum of indexes where a given FrameId shows up, where an index is the
// frame's position within a call stack's llvm::SmallVector<FrameId>. Among
// equally frequent frames, a smaller sum means the frame tends to appear
// earlier in call stacks.
939+
uint64_t PositionSum = 0;
940+
};
941+
942+
// Compute a histogram of Frames in call stacks.
943+
llvm::DenseMap<FrameId, FrameStat>
944+
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
945+
&MemProfCallStackData);
946+
935947
// Construct a radix tree of call stacks.
936948
//
937949
// A set of call stacks might look like:
@@ -1027,9 +1039,11 @@ class CallStackRadixTreeBuilder {
10271039
CallStackRadixTreeBuilder() = default;
10281040

10291041
// Build a radix tree array.
1030-
void build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
1031-
&&MemProfCallStackData,
1032-
const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes);
1042+
void
1043+
build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
1044+
&&MemProfCallStackData,
1045+
const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
1046+
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram);
10331047

10341048
const std::vector<LinearFrameId> &getRadixArray() const { return RadixArray; }
10351049

llvm/lib/ProfileData/InstrProfWriter.cpp

+36-7
Original file line numberDiff line numberDiff line change
@@ -494,17 +494,40 @@ static uint64_t writeMemProfFrames(
494494
static llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
495495
writeMemProfFrameArray(
496496
ProfOStream &OS,
497-
llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
497+
llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData,
498+
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
498499
// Mappings from FrameIds to array indexes.
499500
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes;
500501

501-
// Sort the FrameIDs for stability.
502+
// Compute the order in which we serialize Frames. The order does not matter
503+
// in terms of correctness, but we still compute it for deserialization
504+
// performance. Specifically, if we serialize frequently used Frames one
505+
// after another, we have better cache utilization. For two Frames that
506+
// appear equally frequently, we break a tie by serializing the one that tends
507+
// to appear earlier in call stacks. We implement the tie-breaking mechanism
508+
// by computing the sum of indexes within call stacks for each Frame. If we
509+
// still have a tie, then we just resort to comparing two FrameIds, which is
510+
// just for stability of output.
502511
std::vector<std::pair<memprof::FrameId, const memprof::Frame *>> FrameIdOrder;
503512
FrameIdOrder.reserve(MemProfFrameData.size());
504513
for (const auto &[Id, Frame] : MemProfFrameData)
505514
FrameIdOrder.emplace_back(Id, &Frame);
506515
assert(MemProfFrameData.size() == FrameIdOrder.size());
507-
llvm::sort(FrameIdOrder);
516+
llvm::sort(FrameIdOrder,
517+
[&](const std::pair<memprof::FrameId, const memprof::Frame *> &L,
518+
const std::pair<memprof::FrameId, const memprof::Frame *> &R) {
519+
const auto &SL = FrameHistogram[L.first];
520+
const auto &SR = FrameHistogram[R.first];
521+
// Popular FrameIds should come first.
522+
if (SL.Count != SR.Count)
523+
return SL.Count > SR.Count;
524+
// If they are equally popular, then the one that tends to appear
525+
// earlier in call stacks should come first.
526+
if (SL.PositionSum != SR.PositionSum)
527+
return SL.PositionSum < SR.PositionSum;
528+
// Compare their FrameIds for sort stability.
529+
return L.first < R.first;
530+
});
508531

509532
// Serialize all frames while creating mappings from linear IDs to FrameIds.
510533
uint64_t Index = 0;
@@ -543,12 +566,14 @@ writeMemProfCallStackArray(
543566
llvm::MapVector<memprof::CallStackId, llvm::SmallVector<memprof::FrameId>>
544567
&MemProfCallStackData,
545568
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
546-
&MemProfFrameIndexes) {
569+
&MemProfFrameIndexes,
570+
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
547571
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
548572
MemProfCallStackIndexes;
549573

550574
memprof::CallStackRadixTreeBuilder Builder;
551-
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
575+
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
576+
FrameHistogram);
552577
for (auto I : Builder.getRadixArray())
553578
OS.write32(I);
554579
MemProfCallStackIndexes = Builder.takeCallStackPos();
@@ -704,13 +729,17 @@ static Error writeMemProfV3(ProfOStream &OS,
704729
Schema = memprof::getFullSchema();
705730
writeMemProfSchema(OS, Schema);
706731

732+
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> FrameHistogram =
733+
memprof::computeFrameHistogram(MemProfData.CallStackData);
734+
assert(MemProfData.FrameData.size() == FrameHistogram.size());
735+
707736
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes =
708-
writeMemProfFrameArray(OS, MemProfData.FrameData);
737+
writeMemProfFrameArray(OS, MemProfData.FrameData, FrameHistogram);
709738

710739
uint64_t CallStackPayloadOffset = OS.tell();
711740
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
712741
MemProfCallStackIndexes = writeMemProfCallStackArray(
713-
OS, MemProfData.CallStackData, MemProfFrameIndexes);
742+
OS, MemProfData.CallStackData, MemProfFrameIndexes, FrameHistogram);
714743

715744
uint64_t RecordPayloadOffset = OS.tell();
716745
uint64_t RecordTableOffset =

llvm/lib/ProfileData/MemProf.cpp

+63-4
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,8 @@ LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
486486
void CallStackRadixTreeBuilder::build(
487487
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
488488
&&MemProfCallStackData,
489-
const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
489+
const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
490+
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
490491
// Take the vector portion of MemProfCallStackData. The vector is exactly
491492
// what we need to sort. Also, we no longer need its lookup capability.
492493
llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();
@@ -498,14 +499,56 @@ void CallStackRadixTreeBuilder::build(
498499
return;
499500
}
500501

501-
// Sort the list of call stacks in the dictionary order to maximize the length
502-
// of the common prefix between two adjacent call stacks.
502+
// Sorting the list of call stacks in the dictionary order is sufficient to
503+
// maximize the length of the common prefix between two adjacent call stacks
504+
// and thus minimize the length of RadixArray. However, we go one step
505+
// further and try to reduce the number of times we follow pointers to parents
506+
// during deserialization. Consider a poorly encoded radix tree:
507+
//
508+
// CallStackId 1: f1 -> f2 -> f3
509+
// |
510+
// CallStackId 2: +--- f4 -> f5
511+
// |
512+
// CallStackId 3: +--> f6
513+
//
514+
// Here, f2 and f4 appear once and twice, respectively, in the call stacks.
515+
// Once we encode CallStackId 1 into RadixArray, every other call stack with
516+
// common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
517+
// shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
518+
// parents twice.
519+
//
520+
// We try to alleviate the situation by sorting the list of call stacks by
521+
// comparing the popularity of frames rather than the integer values of
522+
// FrameIds. In the example above, f4 is more popular than f2, so we sort the
523+
// call stacks and encode them as:
524+
//
525+
// CallStackId 2: f1 -- f4 -> f5
526+
// | |
527+
// CallStackId 3: | +--> f6
528+
// |
529+
// CallStackId 1: +--> f2 -> f3
530+
//
531+
// Notice that CallStackId 3 follows a pointer to a parent only once.
532+
//
533+
// All this is a quick-n-dirty trick to reduce the number of jumps. The
534+
// proper way would be to compute the weight of each radix tree node -- how
535+
// many call stacks use a given radix tree node, and encode a radix tree from
536+
// the heaviest node first. We do not do so because that's a lot of work.
503537
llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
504538
// Call stacks are stored from leaf to root. Perform comparisons from the
505539
// root.
506540
return std::lexicographical_compare(
507541
L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
508-
[&](FrameId F1, FrameId F2) { return F1 < F2; });
542+
[&](FrameId F1, FrameId F2) {
543+
uint64_t H1 = FrameHistogram[F1].Count;
544+
uint64_t H2 = FrameHistogram[F2].Count;
545+
// Popular frames should come later because we encode call stacks from
546+
// the last one in the list.
547+
if (H1 != H2)
548+
return H1 < H2;
549+
// For sort stability.
550+
return F1 < F2;
551+
});
509552
});
510553

511554
// Reserve some reasonable amount of storage.
@@ -569,6 +612,22 @@ void CallStackRadixTreeBuilder::build(
569612
V = RadixArray.size() - 1 - V;
570613
}
571614

615+
// Compute a histogram of Frames in call stacks.
//
// Walks every call stack in MemProfCallStackData and, for each frame
// occurrence, increments that FrameId's Count and adds the frame's index
// within its call stack to that FrameId's PositionSum. The argument is only
// read here. A FrameId that appears in no call stack gets no entry in the
// returned map.
llvm::DenseMap<FrameId, FrameStat>
616+
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
617+
&MemProfCallStackData) {
618+
llvm::DenseMap<FrameId, FrameStat> Histogram;
619+
620+
for (const auto &KV : MemProfCallStackData) {
621+
// Only the frame list matters; the CallStackId key is not used.
const auto &CS = KV.second;
622+
for (unsigned I = 0, E = CS.size(); I != E; ++I) {
623+
// The first lookup of a FrameId creates its entry, which starts with both
// fields at zero per FrameStat's default member initializers.
auto &S = Histogram[CS[I]];
624+
++S.Count;
625+
// Accumulate the position so that callers can tell which of two equally
// frequent frames tends to appear earlier in call stacks.
S.PositionSum += I;
626+
}
627+
}
628+
return Histogram;
629+
}
630+
572631
void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
573632
for (const auto &AS : Record.AllocSites) {
574633
assert(AS.CSId == hashCallStack(AS.CallStack));

llvm/unittests/ProfileData/MemProfTest.cpp

+20-4
Original file line numberDiff line numberDiff line change
@@ -667,8 +667,12 @@ TEST(MemProf, MissingFrameId) {
667667
TEST(MemProf, RadixTreeBuilderEmpty) {
668668
llvm::DenseMap<FrameId, llvm::memprof::LinearFrameId> MemProfFrameIndexes;
669669
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
670+
llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
671+
FrameHistogram =
672+
llvm::memprof::computeFrameHistogram(MemProfCallStackData);
670673
llvm::memprof::CallStackRadixTreeBuilder Builder;
671-
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
674+
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
675+
FrameHistogram);
672676
ASSERT_THAT(Builder.getRadixArray(), testing::IsEmpty());
673677
const auto Mappings = Builder.takeCallStackPos();
674678
ASSERT_THAT(Mappings, testing::IsEmpty());
@@ -681,8 +685,12 @@ TEST(MemProf, RadixTreeBuilderOne) {
681685
llvm::SmallVector<llvm::memprof::FrameId> CS1 = {13, 12, 11};
682686
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
683687
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
688+
llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
689+
FrameHistogram =
690+
llvm::memprof::computeFrameHistogram(MemProfCallStackData);
684691
llvm::memprof::CallStackRadixTreeBuilder Builder;
685-
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
692+
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
693+
FrameHistogram);
686694
EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({
687695
3U, // Size of CS1,
688696
3U, // MemProfFrameIndexes[13]
@@ -704,8 +712,12 @@ TEST(MemProf, RadixTreeBuilderTwo) {
704712
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
705713
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
706714
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
715+
llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
716+
FrameHistogram =
717+
llvm::memprof::computeFrameHistogram(MemProfCallStackData);
707718
llvm::memprof::CallStackRadixTreeBuilder Builder;
708-
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
719+
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
720+
FrameHistogram);
709721
EXPECT_THAT(Builder.getRadixArray(),
710722
testing::ElementsAreArray({
711723
2U, // Size of CS1
@@ -738,8 +750,12 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) {
738750
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
739751
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS3), CS3});
740752
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS4), CS4});
753+
llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
754+
FrameHistogram =
755+
llvm::memprof::computeFrameHistogram(MemProfCallStackData);
741756
llvm::memprof::CallStackRadixTreeBuilder Builder;
742-
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
757+
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
758+
FrameHistogram);
743759
EXPECT_THAT(Builder.getRadixArray(),
744760
testing::ElementsAreArray({
745761
4U, // Size of CS1

0 commit comments

Comments
 (0)