Skip to content

[memprof] Improve deserialization performance in V3 #94787

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions llvm/include/llvm/ProfileData/MemProf.h
Original file line number Diff line number Diff line change
Expand Up @@ -932,6 +932,18 @@ struct IndexedMemProfData {
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> CallStackData;
};

// Per-FrameId statistics gathered while scanning call stacks.
struct FrameStat {
  // How many times the FrameId was seen across all call stacks.
  uint64_t Count{0};
  // Running total of the indexes (within each call stack) at which the
  // FrameId was seen.
  uint64_t PositionSum{0};
};

// Compute a histogram of Frames in call stacks.
//
// For every FrameId that occurs in MemProfCallStackData, the returned map
// holds a FrameStat recording the number of occurrences of that FrameId and
// the sum of the indexes at which it occurs within the call stacks.
llvm::DenseMap<FrameId, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
&MemProfCallStackData);

// Construct a radix tree of call stacks.
//
// A set of call stacks might look like:
Expand Down Expand Up @@ -1027,9 +1039,11 @@ class CallStackRadixTreeBuilder {
CallStackRadixTreeBuilder() = default;

// Build a radix tree array.
void build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
&&MemProfCallStackData,
const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes);
void
build(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
&&MemProfCallStackData,
const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram);

const std::vector<LinearFrameId> &getRadixArray() const { return RadixArray; }

Expand Down
43 changes: 36 additions & 7 deletions llvm/lib/ProfileData/InstrProfWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -494,17 +494,40 @@ static uint64_t writeMemProfFrames(
static llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
writeMemProfFrameArray(
ProfOStream &OS,
llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData) {
llvm::MapVector<memprof::FrameId, memprof::Frame> &MemProfFrameData,
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
// Mappings from FrameIds to array indexes.
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes;

// Sort the FrameIDs for stability.
// Compute the order in which we serialize Frames. The order does not matter
// in terms of correctness, but we still compute it for deserialization
// performance. Specifically, if we serialize frequently used Frames one
// after another, we have better cache utilization. For two Frames that
// appear equally frequently, we break a tie by serializing the one that tends
// to appear earlier in call stacks. We implement the tie-breaking mechanism
// by computing the sum of indexes within call stacks for each Frame. If we
// still have a tie, then we just resort to comparing the two FrameIds, which
// is just for stability of output.
std::vector<std::pair<memprof::FrameId, const memprof::Frame *>> FrameIdOrder;
FrameIdOrder.reserve(MemProfFrameData.size());
for (const auto &[Id, Frame] : MemProfFrameData)
FrameIdOrder.emplace_back(Id, &Frame);
assert(MemProfFrameData.size() == FrameIdOrder.size());
llvm::sort(FrameIdOrder);
llvm::sort(FrameIdOrder,
[&](const std::pair<memprof::FrameId, const memprof::Frame *> &L,
const std::pair<memprof::FrameId, const memprof::Frame *> &R) {
const auto &SL = FrameHistogram[L.first];
const auto &SR = FrameHistogram[R.first];
// Popular FrameIds should come first.
if (SL.Count != SR.Count)
return SL.Count > SR.Count;
// If they are equally popular, then the one that tends to appear
// earlier in call stacks should come first.
if (SL.PositionSum != SR.PositionSum)
return SL.PositionSum < SR.PositionSum;
// Compare their FrameIds for sort stability.
return L.first < R.first;
});

// Serialize all frames while creating mappings from linear IDs to FrameIds.
uint64_t Index = 0;
Expand Down Expand Up @@ -543,12 +566,14 @@ writeMemProfCallStackArray(
llvm::MapVector<memprof::CallStackId, llvm::SmallVector<memprof::FrameId>>
&MemProfCallStackData,
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
&MemProfFrameIndexes) {
&MemProfFrameIndexes,
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
MemProfCallStackIndexes;

memprof::CallStackRadixTreeBuilder Builder;
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
FrameHistogram);
for (auto I : Builder.getRadixArray())
OS.write32(I);
MemProfCallStackIndexes = Builder.takeCallStackPos();
Expand Down Expand Up @@ -704,13 +729,17 @@ static Error writeMemProfV3(ProfOStream &OS,
Schema = memprof::getFullSchema();
writeMemProfSchema(OS, Schema);

llvm::DenseMap<memprof::FrameId, memprof::FrameStat> FrameHistogram =
memprof::computeFrameHistogram(MemProfData.CallStackData);
assert(MemProfData.FrameData.size() == FrameHistogram.size());

llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId> MemProfFrameIndexes =
writeMemProfFrameArray(OS, MemProfData.FrameData);
writeMemProfFrameArray(OS, MemProfData.FrameData, FrameHistogram);

uint64_t CallStackPayloadOffset = OS.tell();
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
MemProfCallStackIndexes = writeMemProfCallStackArray(
OS, MemProfData.CallStackData, MemProfFrameIndexes);
OS, MemProfData.CallStackData, MemProfFrameIndexes, FrameHistogram);

uint64_t RecordPayloadOffset = OS.tell();
uint64_t RecordTableOffset =
Expand Down
67 changes: 63 additions & 4 deletions llvm/lib/ProfileData/MemProf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,8 @@ LinearCallStackId CallStackRadixTreeBuilder::encodeCallStack(
void CallStackRadixTreeBuilder::build(
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
&&MemProfCallStackData,
const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes) {
const llvm::DenseMap<FrameId, LinearFrameId> &MemProfFrameIndexes,
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
// Take the vector portion of MemProfCallStackData. The vector is exactly
// what we need to sort. Also, we no longer need its lookup capability.
llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();
Expand All @@ -497,14 +498,56 @@ void CallStackRadixTreeBuilder::build(
return;
}

// Sort the list of call stacks in the dictionary order to maximize the length
// of the common prefix between two adjacent call stacks.
// Sorting the list of call stacks in the dictionary order is sufficient to
// maximize the length of the common prefix between two adjacent call stacks
// and thus minimize the length of RadixArray. However, we go one step
// further and try to reduce the number of times we follow pointers to parents
// during deserialization. Consider a poorly encoded radix tree:
//
// CallStackId 1: f1 -> f2 -> f3
// |
// CallStackId 2: +--- f4 -> f5
// |
// CallStackId 3: +--> f6
//
// Here, f2 and f4 appear once and twice, respectively, in the call stacks.
// Once we encode CallStackId 1 into RadixArray, every other call stack with
// common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
// shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to
// parents twice.
//
// We try to alleviate the situation by sorting the list of call stacks by
// comparing the popularity of frames rather than the integer values of
// FrameIds. In the example above, f4 is more popular than f2, so we sort the
// call stacks and encode them as:
//
// CallStackId 2: f1 -- f4 -> f5
// | |
// CallStackId 3: | +--> f6
// |
// CallStackId 1: +--> f2 -> f3
//
// Notice that CallStackId 3 follows a pointer to a parent only once.
//
// All this is a quick-n-dirty trick to reduce the number of jumps. The
// proper way would be to compute the weight of each radix tree node -- how
// many call stacks use a given radix tree node, and encode a radix tree from
// the heaviest node first. We do not do so because that's a lot of work.
llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
// Call stacks are stored from leaf to root. Perform comparisons from the
// root.
return std::lexicographical_compare(
L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
[&](FrameId F1, FrameId F2) { return F1 < F2; });
[&](FrameId F1, FrameId F2) {
uint64_t H1 = FrameHistogram[F1].Count;
uint64_t H2 = FrameHistogram[F2].Count;
// Popular frames should come later because we encode call stacks from
// the last one in the list.
if (H1 != H2)
return H1 < H2;
// For sort stability.
return F1 < F2;
});
});

// Reserve some reasonable amount of storage.
Expand Down Expand Up @@ -568,6 +611,22 @@ void CallStackRadixTreeBuilder::build(
V = RadixArray.size() - 1 - V;
}

// Scan every call stack in MemProfCallStackData and, for each FrameId
// encountered, tally its number of occurrences and accumulate the indexes at
// which it occurs within its call stack.
llvm::DenseMap<FrameId, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
                          &MemProfCallStackData) {
  llvm::DenseMap<FrameId, FrameStat> Result;

  for (const auto &Entry : MemProfCallStackData) {
    // Index of the current frame within this call stack; restarts at zero for
    // each call stack.
    uint64_t Position = 0;
    for (const FrameId F : Entry.second) {
      // operator[] default-constructs a zeroed FrameStat on first sight.
      FrameStat &Stat = Result[F];
      Stat.Count += 1;
      Stat.PositionSum += Position;
      ++Position;
    }
  }

  return Result;
}

void verifyIndexedMemProfRecord(const IndexedMemProfRecord &Record) {
for (const auto &AS : Record.AllocSites) {
assert(AS.CSId == hashCallStack(AS.CallStack));
Expand Down
24 changes: 20 additions & 4 deletions llvm/unittests/ProfileData/MemProfTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -667,8 +667,12 @@ TEST(MemProf, MissingFrameId) {
TEST(MemProf, RadixTreeBuilderEmpty) {
llvm::DenseMap<FrameId, llvm::memprof::LinearFrameId> MemProfFrameIndexes;
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
FrameHistogram =
llvm::memprof::computeFrameHistogram(MemProfCallStackData);
llvm::memprof::CallStackRadixTreeBuilder Builder;
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
FrameHistogram);
ASSERT_THAT(Builder.getRadixArray(), testing::IsEmpty());
const auto Mappings = Builder.takeCallStackPos();
ASSERT_THAT(Mappings, testing::IsEmpty());
Expand All @@ -681,8 +685,12 @@ TEST(MemProf, RadixTreeBuilderOne) {
llvm::SmallVector<llvm::memprof::FrameId> CS1 = {13, 12, 11};
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
FrameHistogram =
llvm::memprof::computeFrameHistogram(MemProfCallStackData);
llvm::memprof::CallStackRadixTreeBuilder Builder;
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
FrameHistogram);
EXPECT_THAT(Builder.getRadixArray(), testing::ElementsAreArray({
3U, // Size of CS1,
3U, // MemProfFrameIndexes[13]
Expand All @@ -704,8 +712,12 @@ TEST(MemProf, RadixTreeBuilderTwo) {
llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> MemProfCallStackData;
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS1), CS1});
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
FrameHistogram =
llvm::memprof::computeFrameHistogram(MemProfCallStackData);
llvm::memprof::CallStackRadixTreeBuilder Builder;
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
FrameHistogram);
EXPECT_THAT(Builder.getRadixArray(),
testing::ElementsAreArray({
2U, // Size of CS1
Expand Down Expand Up @@ -738,8 +750,12 @@ TEST(MemProf, RadixTreeBuilderSuccessiveJumps) {
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS2), CS2});
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS3), CS3});
MemProfCallStackData.insert({llvm::memprof::hashCallStack(CS4), CS4});
llvm::DenseMap<llvm::memprof::FrameId, llvm::memprof::FrameStat>
FrameHistogram =
llvm::memprof::computeFrameHistogram(MemProfCallStackData);
llvm::memprof::CallStackRadixTreeBuilder Builder;
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes);
Builder.build(std::move(MemProfCallStackData), MemProfFrameIndexes,
FrameHistogram);
EXPECT_THAT(Builder.getRadixArray(),
testing::ElementsAreArray({
4U, // Size of CS1
Expand Down
Loading