
Commit 75161d4

[compiler-rt][ctx_profile] Add the instrumented contextual profiling APIs
APIs for contextual profiling. (Tracking Issue: llvm#89287, RFC referenced there)
1 parent: 4e9decf

3 files changed: +483 -0 lines changed


compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp

Lines changed: 213 additions & 0 deletions
@@ -13,11 +13,76 @@
 #include "sanitizer_common/sanitizer_mutex.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_thread_safety.h"
+#include "sanitizer_common/sanitizer_vector.h"
 
 #include <assert.h>
 
 using namespace __ctx_profile;
 
+namespace {
+__sanitizer::SpinMutex AllContextsMutex;
+SANITIZER_GUARDED_BY(AllContextsMutex)
+__sanitizer::Vector<ContextRoot *> AllContextRoots;
+
+// Mark a ContextNode pointer as pointing at the thread-local scratch buffer
+// by setting its LSB; real nodes are aligned, so the bit is otherwise unused.
+ContextNode *markAsScratch(const ContextNode *Ctx) {
+  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
+}
+
+// Read a value and reset it to zero, in one step.
+template <typename T> T consume(T &V) {
+  auto R = V;
+  V = {0};
+  return R;
+}
+
+constexpr size_t kPower = 20;
+constexpr size_t kBuffSize = 1 << kPower;
+
+size_t getArenaAllocSize(size_t Needed) {
+  if (Needed >= kBuffSize)
+    return 2 * Needed;
+  return kBuffSize;
+}
+
+// Check that the context trie under Root is self-consistent: every node in
+// every arena starts at a recorded address, and every subcontext points at
+// one of those recorded addresses.
+bool validate(const ContextRoot *Root) {
+  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
+               .second)
+        return false;
+      Pos += Ctx->size();
+    }
+  }
+
+  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
+    const auto *Pos = Mem->start();
+    while (Pos < Mem->pos()) {
+      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
+      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
+        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
+          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
+            return false;
+
+      Pos += Ctx->size();
+    }
+  }
+  return true;
+}
+} // namespace
+
+__thread char __Buffer[kBuffSize] = {0};
+
+#define TheScratchContext                                                      \
+  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
+__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
+                                                                 nullptr};
+__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
+
+__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
+    nullptr;
 
 // FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
 // the dependency on the latter.
 Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
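The scratch mechanism above works because genuine ContextNode allocations are well-aligned, so the LSB of a real node pointer is always 0 and can be borrowed as a "this is the thread-local scratch buffer" flag. A minimal standalone sketch of the convention (Node stands in for ContextNode, and the untaint helper is hypothetical, shown for illustration only):

#include <cassert>
#include <cstdint>

struct alignas(8) Node {}; // stand-in for ContextNode; real nodes are aligned

Node *markAsScratch(const Node *P) {
  // Set the LSB to taint the pointer.
  return reinterpret_cast<Node *>(reinterpret_cast<uint64_t>(P) | 1);
}

bool isScratch(const Node *P) { return reinterpret_cast<uint64_t>(P) & 1; }

// Hypothetical helper: clear the LSB to recover the original pointer.
Node *untaint(Node *P) {
  return reinterpret_cast<Node *>(reinterpret_cast<uint64_t>(P) & ~uint64_t(1));
}

int main() {
  Node N;
  Node *Tainted = markAsScratch(&N);
  assert(isScratch(Tainted) && !isScratch(&N));
  assert(untaint(Tainted) == &N);
  return 0;
}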
@@ -38,3 +103,151 @@ void Arena::freeArenaList(Arena *&A) {
   }
   A = nullptr;
 }
+
+inline ContextNode *ContextNode::alloc(char *Place, GUID Guid,
+                                       uint32_t NrCounters,
+                                       uint32_t NrCallsites,
+                                       ContextNode *Next) {
+  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
+}
+
+void ContextNode::reset() {
+  for (uint32_t I = 0; I < NrCounters; ++I)
+    counters()[I] = 0;
+  for (uint32_t I = 0; I < NrCallsites; ++I)
+    for (auto *Next = subContexts()[I]; Next; Next = Next->Next)
+      Next->reset();
+}
+
+// Slow path of __llvm_ctx_profile_get_context: allocate a new ContextNode and
+// link it at the front of the callsite's list of subcontexts.
+ContextNode *getCallsiteSlow(uint64_t Guid, ContextNode **InsertionPoint,
+                             uint32_t NrCounters, uint32_t NrCallsites) {
+  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
+  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  if (!AllocPlace) {
+    // The current arena is full: chain a new one and retry the allocation
+    // there, otherwise the placement new below would get a null pointer.
+    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
+        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
+    AllocPlace = Mem->tryBumpAllocate(AllocSize);
+  }
+  auto *Ret = ContextNode::alloc(AllocPlace, Guid, NrCounters, NrCallsites,
+                                 *InsertionPoint);
+  *InsertionPoint = Ret;
+  return Ret;
+}
+
+ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+                                            uint32_t NrCounters,
+                                            uint32_t NrCallsites) {
+  if (!__llvm_ctx_profile_current_context_root) {
+    return TheScratchContext;
+  }
+  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
+  if (!CallsiteContext || isScratch(*CallsiteContext))
+    return TheScratchContext;
+
+  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
+  if (ExpectedCallee != Callee)
+    return TheScratchContext;
+
+  auto *Callsite = *CallsiteContext;
+  while (Callsite && Callsite->guid() != Guid) {
+    Callsite = Callsite->next();
+  }
+  auto *Ret = Callsite ? Callsite
+                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
+                                         NrCallsites);
+  if (Ret->callsites_size() != NrCallsites ||
+      Ret->counters_size() != NrCounters)
+    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
+                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
+                        Ret, Guid, NrCallsites, NrCounters, Ret->guid(),
+                        Ret->callsites_size(), Ret->counters_size());
+  Ret->onEntry();
+  return Ret;
+}
+
+// Lazily set up a root's first arena and root ContextNode, under the global
+// lock.
+void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
+                  uint32_t NrCallsites) {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  // Re-check - we got here without holding the lock.
+  if (Root->FirstMemBlock)
+    return;
+  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
+  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
+  Root->FirstMemBlock = M;
+  Root->CurrentMem = M;
+  Root->FirstNode = ContextNode::alloc(M->tryBumpAllocate(Needed), Guid,
+                                       NrCounters, NrCallsites);
+  AllContextRoots.PushBack(Root);
+}
+
+ContextNode *__llvm_ctx_profile_start_context(
+    ContextRoot *Root, GUID Guid, uint32_t Counters,
+    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (!Root->FirstMemBlock) {
+    setupContext(Root, Guid, Counters, Callsites);
+  }
+  if (Root->Taken.TryLock()) {
+    __llvm_ctx_profile_current_context_root = Root;
+    Root->FirstNode->onEntry();
+    return Root->FirstNode;
+  }
+  // The root is already active (reentered, or running on another thread):
+  // collect in the scratch context instead.
+  __llvm_ctx_profile_current_context_root = nullptr;
+  return TheScratchContext;
+}
+
+void __llvm_ctx_profile_release_context(ContextRoot *Root)
+    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  if (__llvm_ctx_profile_current_context_root) {
+    __llvm_ctx_profile_current_context_root = nullptr;
+    Root->Taken.Unlock();
+  }
+}
+
+void __llvm_ctx_profile_start_collection() {
+  size_t NrMemUnits = 0;
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
+        &Root->Taken);
+    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
+      ++NrMemUnits;
+
+    Root->FirstNode->reset();
+  }
+  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
+}
+
+bool __llvm_ctx_profile_fetch(
+    void *Data, bool (*Writer)(void *W, const __ctx_profile::ContextNode &)) {
+  assert(Writer);
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
+    auto *Root = AllContextRoots[I];
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
+        &Root->Taken);
+    if (!validate(Root)) {
+      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
+      return false;
+    }
+    if (!Writer(Data, *Root->FirstNode))
+      return false;
+  }
+  return true;
+}
+
+void __llvm_ctx_profile_free() {
+  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+      &AllContextsMutex);
+  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+      auto *C = A;
+      A = A->next();
+      __sanitizer::InternalFree(C);
+    }
+  AllContextRoots.Reset();
+}

compiler-rt/lib/ctx_profile/CtxInstrProfiling.h

Lines changed: 116 additions & 0 deletions
@@ -9,9 +9,11 @@
 #ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
 #define CTX_PROFILE_CTXINSTRPROFILING_H_
 
+#include "sanitizer_common/sanitizer_mutex.h"
 #include <sanitizer/common_interface_defs.h>
 
 namespace __ctx_profile {
+using GUID = uint64_t;
 
 /// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
 /// Allocation and de-allocation happen using sanitizer APIs. We make that
@@ -51,5 +53,119 @@ class Arena final {
   const uint64_t Size;
 };
 
+/// A node in the call context trie. The allocation is one contiguous block:
+/// the fixed-size header below, then NrCounters counters, then NrCallsites
+/// subcontext list heads.
+class ContextNode final {
+  const GUID Guid;
+  ContextNode *const Next;
+  const uint32_t NrCounters;
+  const uint32_t NrCallsites;
+
+public:
+  ContextNode(GUID Guid, uint32_t NrCounters, uint32_t NrCallsites,
+              ContextNode *Next = nullptr)
+      : Guid(Guid), Next(Next), NrCounters(NrCounters),
+        NrCallsites(NrCallsites) {}
+  static inline ContextNode *alloc(char *Place, GUID Guid, uint32_t NrCounters,
+                                   uint32_t NrCallsites,
+                                   ContextNode *Next = nullptr);
+
+  static inline size_t getAllocSize(uint32_t NrCounters, uint32_t NrCallsites) {
+    return sizeof(ContextNode) + sizeof(uint64_t) * NrCounters +
+           sizeof(ContextNode *) * NrCallsites;
+  }
+
+  // The counters are laid out immediately after the header.
+  uint64_t *counters() {
+    ContextNode *addr_after = &(this[1]);
+    return reinterpret_cast<uint64_t *>(reinterpret_cast<char *>(addr_after));
+  }
+
+  uint32_t counters_size() const { return NrCounters; }
+  uint32_t callsites_size() const { return NrCallsites; }
+
+  const uint64_t *counters() const {
+    return const_cast<ContextNode *>(this)->counters();
+  }
+
+  // The subcontext list heads follow the counters.
+  ContextNode **subContexts() {
+    return reinterpret_cast<ContextNode **>(&(counters()[NrCounters]));
+  }
+
+  ContextNode *const *subContexts() const {
+    return const_cast<ContextNode *>(this)->subContexts();
+  }
+
+  GUID guid() const { return Guid; }
+  ContextNode *next() { return Next; }
+
+  size_t size() const { return getAllocSize(NrCounters, NrCallsites); }
+
+  void reset();
+
+  // Counter 0 is the entry count.
+  void onEntry() { ++counters()[0]; }
+
+  uint64_t entrycount() const { return counters()[0]; }
+};
+
+/// ContextRoots are allocated by LLVM for entrypoints. The main concern is
+/// the total size; LLVM doesn't actually dereference members.
+struct ContextRoot {
+  ContextNode *FirstNode = nullptr;
+  Arena *FirstMemBlock = nullptr;
+  Arena *CurrentMem = nullptr;
+  // This is initialized by the static zero initializer in LLVM.
+  ::__sanitizer::StaticSpinMutex Taken;
+
+  // Avoid surprises due to (unlikely) StaticSpinMutex changes.
+  static_assert(sizeof(Taken) == 1);
+};
+
+/// This API is exposed for testing.
+inline bool isScratch(const ContextNode *Ctx) {
+  return (reinterpret_cast<uint64_t>(Ctx) & 1);
+}
+
 } // namespace __ctx_profile
+
+extern "C" {
+
+// LLVM fills these in when lowering a llvm.instrprof.callsite intrinsic.
+// Position 0 is used when the current context isn't scratch, 1 when it is.
+extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
+extern __thread __ctx_profile::ContextNode *
+    *volatile __llvm_ctx_profile_callsite[2];
+
+// __llvm_ctx_profile_current_context_root is exposed for unit testing,
+// otherwise it's only used internally.
+extern __thread __ctx_profile::ContextRoot
+    *volatile __llvm_ctx_profile_current_context_root;
+
+/// Called by LLVM in the entry BB of an "entry point" function. The returned
+/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
+__ctx_profile::ContextNode *
+__llvm_ctx_profile_start_context(__ctx_profile::ContextRoot *Root,
+                                 __ctx_profile::GUID Guid, uint32_t Counters,
+                                 uint32_t Callsites);
+
+/// Paired with __llvm_ctx_profile_start_context, and called at the exit of the
+/// entry point function.
+void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
+
+/// Called in the entry BB of any function other than an entry point. Same
+/// consideration about the LSB of the returned value as .._start_context.
+__ctx_profile::ContextNode *
+__llvm_ctx_profile_get_context(void *Callee, __ctx_profile::GUID Guid,
+                               uint32_t NrCounters, uint32_t NrCallsites);
+
+/// Prepares for collection. Currently this resets counter values but preserves
+/// internal structure.
+void __llvm_ctx_profile_start_collection();
+
+/// Completely frees allocated memory.
+void __llvm_ctx_profile_free();
+
+/// Used to obtain the profile. The Writer is called for each root ContextNode,
+/// with the ContextRoot::Taken taken. The Writer is responsible for traversing
+/// the structure underneath.
+bool __llvm_ctx_profile_fetch(
+    void *Data, bool (*Writer)(void *, const __ctx_profile::ContextNode &));
+}
 #endif // CTX_PROFILE_CTXINSTRPROFILING_H_
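The trailing-array layout is easy to check: counters() points just past the fixed header and subContexts() just past the counters, so the total allocation is pure arithmetic. A quick sketch of that arithmetic (the 24-byte header figure assumes a typical 64-bit target):

#include <cstdio>

#include "CtxInstrProfiling.h"

int main() {
  // Header: Guid (8) + Next (8) + NrCounters (4) + NrCallsites (4) = 24 bytes,
  // then 2 uint64_t counters (16) and 3 ContextNode* slots (24):
  //   getAllocSize(2, 3) == 24 + 16 + 24 == 64
  std::printf("getAllocSize(2, 3) = %zu\n",
              __ctx_profile::ContextNode::getAllocSize(2, 3));
  return 0;
}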

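As a usage illustration for __llvm_ctx_profile_fetch, a Writer callback can aggregate through the opaque Data pointer and walk the trie itself, as the doc comment requires. The Stats type and visit helper below are assumptions made for this sketch, not part of the commit:

#include <cstdint>

#include "CtxInstrProfiling.h"

namespace {
// Hypothetical aggregate, filled in through the opaque Data pointer.
struct Stats {
  uint64_t NrNodes = 0;
  uint64_t TotalEntries = 0;
};

// The runtime only hands over each root; traversing subcontexts is on us.
void visit(const __ctx_profile::ContextNode &Node, Stats &S) {
  ++S.NrNodes;
  S.TotalEntries += Node.entrycount();
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Sub = Node.subContexts()[I]; Sub; Sub = Sub->next())
      visit(*Sub, S);
}

bool writer(void *Data, const __ctx_profile::ContextNode &Root) {
  visit(Root, *static_cast<Stats *>(Data));
  return true; // returning false would abort the fetch
}
} // namespace

// Usage:
//   Stats S;
//   if (__llvm_ctx_profile_fetch(&S, writer)) { /* S is now populated */ }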