Skip to content

Commit 7e9c2a3

Browse files
authored
Tracing: optimize and reduce overhead (#5958)
We've noticed some delays when tracing is enabled, and suspected it is due to lock contention or GC activity created by the tracing module itself. I've added some benchmarks to measure the overhead of tracing, and made some improvements (more improvements are possible). The benchmarks test both a situation with no workload, and a situation with 2 other threads: one running the same workload as the function under test, and another that runs a very GC intensive workload. Improved: * when max spans limit is hit: don't spam syslog. Time goes down from 202 *milli*seconds to 14 *micro*seconds per span * reduce lock contention: eliminate one global lock, replace with atomic ops * reduce allocation We are now at ~9µs/nested span on this particular machine, but further improvements are possible by delaying work to the exporter thread and finishing really quickly otherwise (e.g. upstream ocaml-trace claims ~60ns/span). When tracing is disabled the overhead is very small, just like before (on the order of 20ns without a workload). Before: ``` Running benchmarks (no workloads) ╭─────────────────────────────────────┬───────────────────────────┬───────────────────────────┬───────────────────────────╮ │name │ major-allocated │ minor-allocated │ monotonic-clock │ ├─────────────────────────────────────┼───────────────────────────┼───────────────────────────┼───────────────────────────┤ │ tracing/max span overflow │ 0.0000 mjw/run│ 1187.0000 mnw/run│ 96214066.4652 ns/run│ │ tracing/overhead(off) │ 0.0000 mjw/run│ 14.0000 mnw/run│ 19.0061 ns/run│ │ tracing/overhead(on, create span) │ 2.9250 mjw/run│ 496.3888 mnw/run│ 25431.5210 ns/run│ │ tracing/overhead(on, no span) │ 0.0000 mjw/run│ 14.0000 mnw/run│ 18.8063 ns/run│ ╰─────────────────────────────────────┴───────────────────────────┴───────────────────────────┴───────────────────────────╯ tracing/overhead(on, no span) (ns): { monotonic-clock per run = 18.806282 (confidence: 20.597860 to 16.986898); r² = Some 0.63699 } tracing/max span overflow (ns): { monotonic-clock per run = 96214066.465201 (confidence: 125336685.666667 to 70136316.380165); r² = Some 0.518256 } tracing/overhead(off) (ns): { monotonic-clock per run = 19.006133 (confidence: 20.619355 to 17.390492); r² = Some 0.667542 } tracing/overhead(on, create span) (ns): { monotonic-clock per run = 25431.521030 (confidence: 37607.254799 to 15151.227932); r² = Some 0.0329005 } Running benchmarks (workloads) ╭─────────────────────────────────────┬───────────────────────────┬───────────────────────────┬───────────────────────────╮ │name │ major-allocated │ minor-allocated │ monotonic-clock │ ├─────────────────────────────────────┼───────────────────────────┼───────────────────────────┼───────────────────────────┤ │ tracing/max span overflow │ 80934.7363 mjw/run│ 10580588.4725 mnw/run│ 202271848.5714 ns/run│ │ tracing/overhead(off) │ 0.0000 mjw/run│ 16.5479 mnw/run│ 212.0958 ns/run│ │ tracing/overhead(on, create span) │ 0.0000 mjw/run│ 503.4400 mnw/run│ 15633.2400 ns/run│ │ tracing/overhead(on, no span) │ 0.0000 mjw/run│ 18.3256 mnw/run│ 326.9568 ns/run│ ╰─────────────────────────────────────┴───────────────────────────┴───────────────────────────┴───────────────────────────╯ tracing/overhead(on, no span) (ns): { monotonic-clock per run = 326.956811; r² = Some -9.40099 } tracing/max span overflow (ns): { monotonic-clock per run = 202271848.571429; r² = Some 0.065489 } tracing/overhead(off) (ns): { monotonic-clock per run = 212.095829; r² = Some -4.46794 } tracing/overhead(on, create span) (ns): { monotonic-clock per run = 15633.240000; r² = Some 0.805726 } ``` After: ``` Running benchmarks (no workloads) ╭─────────────────────────────────────┬───────────────────────────┬───────────────────────────┬───────────────────────────╮ │name │ major-allocated │ minor-allocated │ monotonic-clock │ ├─────────────────────────────────────┼───────────────────────────┼───────────────────────────┼───────────────────────────┤ │ tracing/max span overflow │ 0.0964 mjw/run│ 379.0196 mnw/run│ 14479.0371 ns/run│ │ tracing/overhead(off) │ 0.0000 mjw/run│ 14.0000 mnw/run│ 18.2174 ns/run│ │ tracing/overhead(on, create span) │ 0.0000 mjw/run│ 397.3440 mnw/run│ 14712.1307 ns/run│ │ tracing/overhead(on, no span) │ 0.0000 mjw/run│ 14.0000 mnw/run│ 18.1771 ns/run│ ╰─────────────────────────────────────┴───────────────────────────┴───────────────────────────┴───────────────────────────╯ tracing/overhead(on, no span) (ns): { monotonic-clock per run = 18.177093 (confidence: 19.990180 to 16.374104); r² = Some 0.600249 } tracing/max span overflow (ns): { monotonic-clock per run = 14479.037101 (confidence: 15625.053708 to 13293.667720); r² = Some 0.635766 } tracing/overhead(off) (ns): { monotonic-clock per run = 18.217390 (confidence: 19.845841 to 16.442527); r² = Some 0.642373 } tracing/overhead(on, create span) (ns): { monotonic-clock per run = 14712.130670 (confidence: 23416.444172 to 7426.891454); r² = Some 0.0476263 } Running benchmarks (workloads) ╭─────────────────────────────────────┬───────────────────────────┬───────────────────────────┬───────────────────────────╮ │name │ major-allocated │ minor-allocated │ monotonic-clock │ ├─────────────────────────────────────┼───────────────────────────┼───────────────────────────┼───────────────────────────┤ │ tracing/max span overflow │ 0.0000 mjw/run│ 380.4429 mnw/run│ 7658.2207 ns/run│ │ tracing/overhead(off) │ 0.0000 mjw/run│ 15.3007 mnw/run│ 102.2479 ns/run│ │ tracing/overhead(on, create span) │ 0.0000 mjw/run│ 400.5094 mnw/run│ 8657.5585 ns/run│ │ tracing/overhead(on, no span) │ 0.0000 mjw/run│ 18.1333 mnw/run│ 373.9794 ns/run│ ╰─────────────────────────────────────┴───────────────────────────┴───────────────────────────┴───────────────────────────╯ tracing/overhead(on, no span) (ns): { monotonic-clock per run = 373.979447 (confidence: 435.400802 to 336.056569); r² = Some -1.84338 } tracing/max span overflow (ns): { monotonic-clock per run = 7658.220695 (confidence: 7952.878804 to 7396.117711); r² = Some 0.950954 } tracing/overhead(off) (ns): { monotonic-clock per run = 102.247932 (confidence: 119.364768 to 89.830417); r² = Some -0.607146 } tracing/overhead(on, create span) (ns): { monotonic-clock per run = 8657.558458 (confidence: 8904.596470 to 8435.216348); r² = Some 0.956299 } ``` Draft PR, this was only unit tested so far.
2 parents 33f20a9 + 959ebf6 commit 7e9c2a3

File tree

8 files changed

+566
-222
lines changed

8 files changed

+566
-222
lines changed

ocaml/libs/tracing/tracing.ml

Lines changed: 221 additions & 189 deletions
Large diffs are not rendered by default.

ocaml/libs/tracing/tracing.mli

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,40 @@ module SpanEvent : sig
5454
type t = {name: string; time: float; attributes: string Attributes.t}
5555
end
5656

57-
module SpanContext : sig
57+
module Span_id : sig
5858
type t
5959

60-
val context : string -> string -> t
60+
val make : unit -> t
61+
62+
val compare : t -> t -> int
63+
64+
val of_string : string -> t
65+
66+
val to_string : t -> string
67+
end
68+
69+
module Trace_id : sig
70+
type t
71+
72+
val make : unit -> t
73+
74+
val compare : t -> t -> int
75+
76+
val of_string : string -> t
77+
78+
val to_string : t -> string
79+
end
80+
81+
module SpanContext : sig
82+
type t
6183

6284
val to_traceparent : t -> string
6385

6486
val of_traceparent : string -> t option
6587

66-
val trace_id_of_span_context : t -> string
88+
val trace_id_of_span_context : t -> Trace_id.t
6789

68-
val span_id_of_span_context : t -> string
90+
val span_id_of_span_context : t -> Span_id.t
6991
end
7092

7193
module Span : sig
@@ -98,17 +120,20 @@ module Span : sig
98120
val get_attributes : t -> (string * string) list
99121
end
100122

123+
module TraceMap : module type of Map.Make (Trace_id)
124+
125+
module SpanMap : module type of Map.Make (Span_id)
126+
101127
module Spans : sig
102128
val set_max_spans : int -> unit
103129

104130
val set_max_traces : int -> unit
105131

106132
val span_count : unit -> int
107133

108-
val since : unit -> (string, Span.t list) Hashtbl.t
134+
val since : unit -> Span.t list * int
109135

110-
val dump :
111-
unit -> (string, Span.t list) Hashtbl.t * (string, Span.t list) Hashtbl.t
136+
val dump : unit -> Span.t SpanMap.t TraceMap.t * (Span.t list * int)
112137
end
113138

114139
module Tracer : sig
@@ -140,8 +165,6 @@ module Tracer : sig
140165
val finish :
141166
?error:exn * string -> Span.t option -> (Span.t option, exn) result
142167

143-
val span_is_finished : Span.t option -> bool
144-
145168
val span_hashtbl_is_empty : unit -> bool
146169

147170
val finished_span_hashtbl_is_empty : unit -> bool

ocaml/libs/tracing/tracing_export.ml

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,24 @@ module Content = struct
8383
)
8484
in
8585
{
86-
id= s |> Span.get_context |> SpanContext.span_id_of_span_context
87-
; traceId= s |> Span.get_context |> SpanContext.trace_id_of_span_context
86+
id=
87+
s
88+
|> Span.get_context
89+
|> SpanContext.span_id_of_span_context
90+
|> Span_id.to_string
91+
; traceId=
92+
s
93+
|> Span.get_context
94+
|> SpanContext.trace_id_of_span_context
95+
|> Trace_id.to_string
8896
; parentId=
8997
s
9098
|> Span.get_parent
9199
|> Option.map (fun x ->
92-
x |> Span.get_context |> SpanContext.span_id_of_span_context
100+
x
101+
|> Span.get_context
102+
|> SpanContext.span_id_of_span_context
103+
|> Span_id.to_string
93104
)
94105
; name= s |> Span.get_name
95106
; timestamp= int_of_float (Span.get_begin_time s *. 1000000.)
@@ -248,19 +259,15 @@ module Destination = struct
248259
| Bugtool ->
249260
(file_export, "Tracing.File.export")
250261
in
251-
let all_spans =
252-
Hashtbl.fold (fun _ spans acc -> spans @ acc) traces []
253-
in
262+
let all_spans, count = traces in
254263
let attributes =
255264
[
256265
("export.span.count", all_spans |> List.length |> string_of_int)
257266
; ("export.endpoint", endpoint_to_string endpoint)
258267
; ( "xs.tracing.spans_table.count"
259268
, Spans.span_count () |> string_of_int
260269
)
261-
; ( "xs.tracing.finished_spans_table.count"
262-
, traces |> Hashtbl.length |> string_of_int
263-
)
270+
; ("xs.tracing.finished_spans_table.count", string_of_int count)
264271
]
265272
in
266273
let@ _ = with_tracing ~parent ~attributes ~name in
@@ -273,17 +280,15 @@ module Destination = struct
273280
debug "Tracing: unable to export span : %s" (Printexc.to_string exn)
274281

275282
let flush_spans () =
276-
let span_list = Spans.since () in
277-
let attributes =
278-
[("export.traces.count", Hashtbl.length span_list |> string_of_int)]
279-
in
283+
let ((_span_list, span_count) as span_info) = Spans.since () in
284+
let attributes = [("export.traces.count", string_of_int span_count)] in
280285
let@ parent =
281286
with_tracing ~parent:None ~attributes ~name:"Tracing.flush_spans"
282287
in
283288
TracerProvider.get_tracer_providers ()
284289
|> List.filter TracerProvider.get_enabled
285290
|> List.concat_map TracerProvider.get_endpoints
286-
|> List.iter (export_to_endpoint parent span_list)
291+
|> List.iter (export_to_endpoint parent span_info)
287292

288293
let delay = Delay.make ()
289294

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
open Bechamel
2+
open Toolkit
3+
4+
(* Bechamel doesn't provide before/after hooks, just allocate/free, but those are done outside the place where
5+
Bechamel checks for GC live words stabilization.
6+
*)
7+
let before_after ~before ~after ~get ~label ~unit =
8+
let shared_state = Atomic.make None and called = Atomic.make 0 in
9+
let module BeforeAfter = struct
10+
type witness = int Atomic.t
11+
12+
let make () = Atomic.make 0
13+
14+
let load t = Atomic.set t 0
15+
16+
let unload _ = ()
17+
18+
let label _ = label
19+
20+
let unit _ = unit
21+
22+
let get _ =
23+
(*
24+
We get added to the instances both at the beginning and the end, so we get called 4 times:
25+
26+
get () - 0: None -> state := before ()
27+
time ()
28+
get () - 1
29+
30+
benchmark_loop ()
31+
32+
get () - 2
33+
time ()
34+
get () - 3, after state, state := None
35+
36+
We want the time measurement to be as close to the benchmark loop as possible,
37+
so we perform operations only on call 1 and 4
38+
*)
39+
let phase = Atomic.fetch_and_add called 1 mod 4 in
40+
let old = Atomic.get shared_state in
41+
match (old, phase) with
42+
| None, 0 ->
43+
before () |> Option.some |> Atomic.set shared_state ;
44+
0.
45+
| Some state, (1 | 2) ->
46+
get state
47+
| Some state, 3 ->
48+
let r = get state in
49+
Atomic.set shared_state None ;
50+
after state ;
51+
r
52+
| None, _ ->
53+
assert false
54+
| Some _, _ ->
55+
assert false
56+
end in
57+
let measure = Measure.register (module BeforeAfter) in
58+
Measure.instance (module BeforeAfter) measure
59+
60+
let skip_label = "workload"
61+
62+
let thread_workload ~before ~run ~after =
63+
let before () =
64+
let state = before ()
65+
and stop = Atomic.make false
66+
and loops = Atomic.make 0 in
67+
let thread_worker () =
68+
while not (Atomic.get stop) do
69+
Sys.opaque_identity (run state : unit) ;
70+
Atomic.incr loops
71+
done
72+
in
73+
let t = Thread.create thread_worker () in
74+
(state, stop, loops, t)
75+
and after (state, stop, _loops, worker) =
76+
Atomic.set stop true ; Thread.join worker ; after state
77+
and get (_, _, loops, _) = Atomic.fetch_and_add loops 1 |> float_of_int in
78+
before_after ~before ~after ~get ~label:skip_label ~unit:"loops"
79+
80+
(* based on bechamel example code *)
81+
82+
(* For very short benchmarks ensure that they get to run long enough to switch threads
83+
a few times.
84+
Bechamel has both an iteration count and time limit, so this won't be a problem for slower benchmarks.
85+
*)
86+
let limit = 10_000_000
87+
88+
let benchmark ~instances tests =
89+
let cfg = Benchmark.cfg ~limit ~quota:(Time.second 10.0) () in
90+
Benchmark.all cfg instances tests
91+
92+
let analyze ~instances raw_results =
93+
let ols ~bootstrap =
94+
Analyze.ols ~bootstrap ~r_square:true ~predictors:[|Measure.run|]
95+
in
96+
let results =
97+
List.map
98+
(fun instance ->
99+
let f bootstrap = Analyze.all (ols ~bootstrap) instance raw_results in
100+
try f 3000 with _ -> f 0
101+
)
102+
instances
103+
in
104+
(Analyze.merge (ols ~bootstrap:3000) instances results, raw_results)
105+
106+
open Notty_unix
107+
108+
let img (window, results) =
109+
Bechamel_notty.Multiple.image_of_ols_results ~rect:window
110+
~predictor:Measure.run results
111+
|> eol
112+
113+
let not_workload measure = not (Measure.label measure = skip_label)
114+
115+
let run_and_print instances tests =
116+
let results, _ =
117+
tests
118+
|> benchmark ~instances
119+
|> analyze ~instances:(List.filter not_workload instances)
120+
in
121+
let window =
122+
match winsize Unix.stdout with
123+
| Some (w, h) ->
124+
{Bechamel_notty.w; h}
125+
| None ->
126+
{Bechamel_notty.w= 80; h= 1}
127+
in
128+
img (window, results) |> eol |> output_image ;
129+
results
130+
|> Hashtbl.iter @@ fun label results ->
131+
if label = Measure.label Instance.monotonic_clock then
132+
let units = Bechamel_notty.Unit.unit_of_label label in
133+
results
134+
|> Hashtbl.iter @@ fun name ols ->
135+
Format.printf "%s (%s):@, %a@." name units Analyze.OLS.pp ols
136+
137+
let cli ?(always = []) ?(workloads = []) tests =
138+
let instances =
139+
always
140+
@ Instance.[monotonic_clock; minor_allocated; major_allocated]
141+
@ always
142+
in
143+
List.iter (fun i -> Bechamel_notty.Unit.add i (Measure.unit i)) instances ;
144+
Format.printf "@,Running benchmarks (no workloads)@." ;
145+
run_and_print instances tests ;
146+
147+
if workloads <> [] then (
148+
Format.printf "@,Running benchmarks (workloads)@." ;
149+
List.iter (fun i -> Bechamel_notty.Unit.add i (Measure.unit i)) workloads ;
150+
(* workloads come first, so that we unpause them in time *)
151+
let instances = workloads @ instances @ workloads in
152+
run_and_print instances tests
153+
)

ocaml/tests/bench/bench_tracing.ml

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
open Bechamel
2+
3+
let ( let@ ) f x = f x
4+
5+
(* TODO: before *)
6+
7+
let trace_test_inner span =
8+
let@ span =
9+
Tracing.with_child_trace
10+
~attributes:[("foo", "testing")]
11+
span ~name:__FUNCTION__
12+
in
13+
let@ _ =
14+
Tracing.with_child_trace ~attributes:[("bar", "val")] span ~name:"test"
15+
in
16+
Sys.opaque_identity ignore ()
17+
18+
let trace_test_span _ = Tracing.with_tracing ~name:__FUNCTION__ trace_test_inner
19+
20+
let trace_test_off _ = trace_test_inner None
21+
22+
let uuid = "TEST"
23+
24+
let export_thread =
25+
(* need to ensure this isn't running outside the benchmarked section,
26+
or bechamel might fail with 'Failed to stabilize GC'
27+
*)
28+
let after _ = Tracing_export.flush_and_exit () in
29+
Bechamel_simple_cli.thread_workload ~before:Tracing_export.main ~after
30+
~run:ignore
31+
32+
let workload1 =
33+
Bechamel_simple_cli.thread_workload ~before:ignore ~after:ignore
34+
~run:trace_test_span
35+
36+
let create_gc_work =
37+
let a = Array.make 1_000 "" in
38+
fun () ->
39+
(* create work for the GC by continously creating a lot of short lived strings *)
40+
Sys.opaque_identity (Array.iteri (fun i _ -> a.(i) <- String.make 2 'x') a)
41+
42+
let workload2 =
43+
Bechamel_simple_cli.thread_workload ~before:ignore ~after:ignore
44+
~run:create_gc_work
45+
46+
let workloads = [workload1; workload2]
47+
48+
let allocate () =
49+
Tracing.TracerProvider.create ~enabled:true ~attributes:[] ~endpoints:[]
50+
~name_label:__MODULE__ ~uuid ;
51+
Tracing_export.main ()
52+
53+
let free t =
54+
Tracing.TracerProvider.destroy ~uuid ;
55+
Tracing_export.flush_and_exit () ;
56+
Thread.join t
57+
58+
let test_tracing_on ?(overflow = false) ~name f =
59+
let allocate () =
60+
if overflow then (
61+
Tracing.Spans.set_max_spans 10 ;
62+
Tracing.Spans.set_max_traces 10
63+
) ;
64+
allocate ()
65+
and free t =
66+
if overflow then (
67+
Tracing.Spans.set_max_spans Bechamel_simple_cli.limit ;
68+
Tracing.Spans.set_max_traces Bechamel_simple_cli.limit
69+
) ;
70+
free t
71+
in
72+
Test.make_with_resource ~name ~allocate ~free Test.uniq f
73+
74+
let benchmarks =
75+
Tracing.Spans.set_max_spans Bechamel_simple_cli.limit ;
76+
Tracing.Spans.set_max_traces Bechamel_simple_cli.limit ;
77+
Test.make_grouped ~name:"tracing"
78+
[
79+
Test.make ~name:"overhead(off)" (Staged.stage trace_test_off)
80+
; test_tracing_on ~name:"overhead(on, no span)" (Staged.stage trace_test_off)
81+
; test_tracing_on ~name:"overhead(on, create span)"
82+
(Staged.stage trace_test_span)
83+
; test_tracing_on ~overflow:true ~name:"max span overflow"
84+
(Staged.stage trace_test_span)
85+
]
86+
87+
let () = Bechamel_simple_cli.cli ~always:[export_thread] ~workloads benchmarks

ocaml/tests/bench/dune

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
(executable
2+
(name bench_tracing)
3+
(libraries tracing bechamel bechamel-notty notty.unix tracing_export threads.posix fmt notty)
4+
)

0 commit comments

Comments
 (0)