Skip to content

Commit 3cfa542

Browse files
committed
benchmarks/gc_latency: add new microbenchmark for tricky mutator latency issues
Gc_latency is a modified version of a program that tickled multiple latency glitches in the Go GC/runtime. This version reports the time of the worst observed glitches so that they can be easily located in a trace file and debugged. This program can also be run as a benchmark to allow easier automated performance monitoring; by default the benchmark doesn't report worst case times because those are too noisy. Updates golang/go#27732. Change-Id: I19b9060f24cda1547b8d75f762316dd5271e32c6 Reviewed-on: https://go-review.googlesource.com/c/benchmarks/+/372256 TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Austin Clements <[email protected]> Run-TryBot: David Chase <[email protected]>
1 parent 6a4c432 commit 3cfa542

File tree

3 files changed

+306
-0
lines changed

3 files changed

+306
-0
lines changed

gc_latency/latency.go

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
// Copyright 2023 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package main
6+
7+
import (
8+
"flag"
9+
"fmt"
10+
"os"
11+
"runtime"
12+
"runtime/trace"
13+
"sort"
14+
"testing"
15+
"time"
16+
"unsafe"
17+
)
18+
19+
// Sizes that control how much allocation work the benchmark performs.
const (
	// bufferLen is the number of slots in a circularBuffer.
	bufferLen = 200_000

	// warmupCount is the number of allocations bench performs before
	// any measurement starts.
	warmupCount = 1_000_000

	// runCount is the number of measured allocations in one run; when
	// run as a benchmark it is multiplied by b.N.
	runCount = 5_000_000
)
24+
25+
type kbyte []byte
26+
type circularBuffer [bufferLen]kbyte
27+
28+
type LB struct {
29+
// Performance measurement stuff
30+
delays []time.Duration // delays observed (for distribution)
31+
worst time.Duration // worst delay observed
32+
33+
// For making sense of the bad outcome.
34+
total time.Duration // total time spent in allocations
35+
allStart time.Time // time (very nearly) at which the trace begins
36+
worstIndex int // index of worst case allocation delay
37+
worstElapsed time.Duration // duration of worst case allocation delay
38+
39+
sink *circularBuffer // assign a pointer here to ensure heap allocation
40+
41+
// How to allocate
42+
43+
// "Fluff" refers to allocating a small fraction of extra quickly-dead objects
44+
// to break up long runs on not-free objects that were once a cause of allocation latency.
45+
doFluff bool
46+
// "Fluff" allocations are all assigned to fluff, so that they are on-heap, but only the last one is live.
47+
fluff kbyte
48+
49+
// The circular buffer can be on the heap, in a global, or on stack.
50+
// This choice affects allocation latency.
51+
howAllocated string
52+
}
53+
54+
// globalBuffer is the globally-allocated circular buffer,
55+
// for measuring the cost of scanning large global objects.
56+
var globalBuffer circularBuffer
57+
58+
// These three methods pass a differently-allocated circularBuffer
59+
// to the benchmarked "work" to see how that affects allocation tail latency.
60+
61+
//go:noinline
62+
func (lb *LB) global(count int) {
63+
lb.work(&globalBuffer, count)
64+
for i := range globalBuffer {
65+
globalBuffer[i] = nil
66+
}
67+
}
68+
69+
//go:noinline
70+
func (lb *LB) heap(count int) {
71+
c := &circularBuffer{}
72+
lb.sink = c // force to heap
73+
lb.work(c, count)
74+
lb.sink = nil
75+
}
76+
77+
//go:noinline
78+
func (lb *LB) stack(count int) {
79+
var c circularBuffer
80+
lb.work(&c, count)
81+
}
82+
83+
// newSlice allocates a 1k slice of bytes and initializes them all to byte(n)
84+
func (lb *LB) newSlice(n int) kbyte {
85+
m := make(kbyte, 1024)
86+
if lb.doFluff && n&63 == 0 {
87+
lb.fluff = make(kbyte, 1024)
88+
}
89+
for i := range m {
90+
m[i] = byte(n)
91+
}
92+
return m
93+
}
94+
95+
// storeSlice stores a newly allocated 1k slice of bytes at c[count%len(c)]
96+
// It also checks the time needed to do this and records the worst case.
97+
func (lb *LB) storeSlice(c *circularBuffer, count int) {
98+
start := time.Now()
99+
c[count%len(c)] = lb.newSlice(count)
100+
elapsed := time.Since(start)
101+
102+
candElapsed := time.Since(lb.allStart) // Record location of worst in trace
103+
if elapsed > lb.worst {
104+
lb.worst = elapsed
105+
lb.worstIndex = count
106+
lb.worstElapsed = candElapsed
107+
}
108+
lb.total = time.Duration(lb.total.Nanoseconds() + elapsed.Nanoseconds())
109+
lb.delays = append(lb.delays, elapsed)
110+
}
111+
112+
//go:noinline
113+
func (lb *LB) work(c *circularBuffer, count int) {
114+
for i := 0; i < count; i++ {
115+
lb.storeSlice(c, i)
116+
}
117+
}
118+
119+
func (lb *LB) doAllocations(count int) {
120+
switch lb.howAllocated {
121+
case "stack":
122+
lb.stack(count)
123+
case "heap":
124+
lb.heap(count)
125+
case "global":
126+
lb.global(count)
127+
}
128+
}
129+
130+
// traceFile is the name of the execution trace file to create; it is set
// by the -trace flag and an empty value disables tracing.
var traceFile string

// flags registers and parses the command-line flags of the stand-alone
// program. It returns the requested buffer placement (-how, default
// "stack") and whether "fluff" allocations are enabled (-fluff); the
// -trace flag is stored in traceFile. If -how is not one of "stack",
// "heap" or "global", it prints a message to stderr and exits with
// status 1.
func flags() (string, bool) {
	var howAllocated = "stack"
	var doFluff bool
	flag.StringVar(&traceFile, "trace", traceFile, "name of trace file to create")
	flag.StringVar(&howAllocated, "how", howAllocated, "how the buffer is allocated = {stack,heap,global}")
	flag.BoolVar(&doFluff, "fluff", doFluff, "insert 'fluff' into allocation runs to break up sweeps")

	flag.Parse()

	switch howAllocated {
	case "stack", "heap", "global":
		// Valid choice, nothing to do.
	default:
		fmt.Fprintf(os.Stderr, "-how needs to be one of 'heap', 'stack' or 'global', saw '%s' instead\n", howAllocated)
		os.Exit(1)
	}
	return howAllocated, doFluff
}
150+
151+
// reportWorstFlag makes bench also report the "worst" metric when run as
// a benchmark. It is bound to the -worst flag in TestMain.
var reportWorstFlag bool
152+
153+
func (lb0 *LB) bench(b *testing.B) {
154+
if b != nil {
155+
b.StopTimer()
156+
}
157+
158+
var c *circularBuffer = &circularBuffer{}
159+
lb0.sink = c // force heap allocation
160+
lb0.delays = make([]time.Duration, 0, runCount)
161+
// Warm up heap, virtual memory, address space, etc.
162+
lb0.work(c, warmupCount)
163+
c, lb0.sink = nil, nil
164+
runtime.GC() // Start fresh, GC with all the timers turned off.
165+
166+
lb := &LB{doFluff: lb0.doFluff, howAllocated: lb0.howAllocated, delays: lb0.delays[:0]}
167+
count := runCount
168+
169+
// Confine tracing and timing defers to a small block.
170+
run := func() {
171+
if traceFile != "" {
172+
f, err := os.Create(traceFile)
173+
if err != nil {
174+
if b != nil {
175+
b.Fatalf("Could not create trace file '%s'\n", traceFile)
176+
} else {
177+
fmt.Fprintf(os.Stderr, "Could not create trace file '%s'\n", traceFile)
178+
os.Exit(1)
179+
}
180+
}
181+
defer f.Close()
182+
trace.Start(f)
183+
defer trace.Stop()
184+
}
185+
lb.allStart = time.Now() // this is for trace file navigation, not benchmark timing.
186+
187+
if b != nil {
188+
count = b.N * count
189+
if b.N > 1 {
190+
lb.delays = make([]time.Duration, 0, count)
191+
}
192+
b.StartTimer()
193+
defer b.StopTimer()
194+
}
195+
lb.doAllocations(count)
196+
}
197+
run()
198+
199+
sort.Slice(lb.delays, func(i, j int) bool { return lb.delays[i] < lb.delays[j] })
200+
delays := lb.delays
201+
delayLen := float64(len(delays))
202+
average, median := time.Duration(lb.total.Nanoseconds()/int64(count)), delays[len(delays)/2]
203+
p29, p39, p49, p59, p69 := lb.delays[int(0.99*delayLen)], delays[int(0.999*delayLen)], delays[int(0.9999*delayLen)], delays[int(0.99999*delayLen)], delays[int(0.999999*delayLen)]
204+
if b != nil {
205+
b.ReportMetric(float64(average.Nanoseconds()), "ns/op")
206+
b.ReportMetric(float64(median), "p50-ns")
207+
b.ReportMetric(float64(p29), "p99-ns")
208+
b.ReportMetric(float64(p39), "p99.9-ns")
209+
b.ReportMetric(float64(p49), "p99.99-ns")
210+
b.ReportMetric(float64(p59), "p99.999-ns")
211+
b.ReportMetric(float64(p69), "p99.9999-ns")
212+
if reportWorstFlag {
213+
b.ReportMetric(float64(lb.worst), "worst")
214+
}
215+
// Don't report worst case, it is ultra-noisy.
216+
} else {
217+
fmt.Printf("GC latency benchmark, how=%s, fluff=%v\n", lb.howAllocated, lb.doFluff)
218+
fmt.Println("Worst allocation latency:", lb.worst)
219+
fmt.Println("Worst allocation index:", lb.worstIndex)
220+
fmt.Println("Worst allocation occurs at run elapsed time:", lb.worstElapsed)
221+
fmt.Println("Average allocation latency:", average)
222+
fmt.Println("Median allocation latency:", median)
223+
fmt.Println("99% allocation latency:", p29)
224+
fmt.Println("99.9% allocation latency:", p39)
225+
fmt.Println("99.99% allocation latency:", p49)
226+
fmt.Println("99.999% allocation latency:", p59)
227+
fmt.Println("99.9999% allocation latency:", p69)
228+
fmt.Println("Sizeof(circularBuffer) =", unsafe.Sizeof(*c))
229+
fmt.Println("Approximate live memory =", unsafe.Sizeof(*c)+bufferLen*1024)
230+
}
231+
}

gc_latency/latency_test.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Copyright 2023 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package main
6+
7+
import (
8+
"flag"
9+
"fmt"
10+
"os"
11+
"testing"
12+
)
13+
14+
// Run as a test, reports allocation time statistics for stack, heap, and globally
15+
// allocated buffers, out to the 99.9999th percentile. Optionally reports worst
16+
// allocation time if -worst is specified, but this is normally too noisy for any
17+
// sort of trend tracking or alerting. The default test usually runs long enough that
18+
// it requires only one iteration.
19+
20+
func TestMain(m *testing.M) {
21+
flag.BoolVar(&reportWorstFlag, "worst", false, "report otherwise too-noisy 'worst' metric in benchmark")
22+
flag.Parse()
23+
os.Exit(m.Run())
24+
}
25+
26+
// testCase describes one benchmark configuration.
type testCase struct {
	howAlloc  string // buffer placement passed to LB.howAllocated
	withFluff bool   // whether "fluff" allocations are enabled (LB.doFluff)
}
30+
31+
func BenchmarkGCLatency(b *testing.B) {
32+
tcs := []testCase{
33+
{"stack", false},
34+
{"stack", true},
35+
{"heap", false},
36+
{"heap", true},
37+
{"global", false},
38+
{"global", true},
39+
}
40+
41+
for _, tc := range tcs {
42+
lb := &LB{doFluff: tc.withFluff, howAllocated: tc.howAlloc}
43+
b.Run(fmt.Sprintf("how=%s/fluff=%v", tc.howAlloc, tc.withFluff),
44+
func(b *testing.B) { lb.bench(b) })
45+
}
46+
}

gc_latency/main.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright 2023 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
package main
6+
7+
// Gc_latency is a modified version of a program that tickled multiple
8+
// latency glitches in the Go GC/runtime. This version reports the time
9+
// of the worst observed glitches so that they can be easily located in
10+
// a trace file and debugged. This program can also be run as a benchmark
11+
// to allow easier automated performance monitoring; by default the benchmark
12+
// doesn't report worst case times because those are too noisy (see -worst).
13+
//
14+
// Usage:
15+
//
16+
// gc_latency [flags]
17+
//
18+
// Flags (as main):
19+
// -fluff
20+
// insert 'fluff' into allocation runs to break up sweeps
21+
// -how string
22+
// how the buffer is allocated = {stack,heap,global} (default "stack")
23+
// -trace string
24+
// name of trace file to create
25+
func main() {
26+
howAllocated, doFluffFlag := flags()
27+
lb := &LB{howAllocated: howAllocated, doFluff: doFluffFlag}
28+
lb.bench(nil)
29+
}

0 commit comments

Comments
 (0)