
Commit 63ceff9

mknyszek authored and gopherbot committed
runtime/metrics: add /sync/mutex/wait/total:seconds metric
This change adds a metric to the runtime/metrics package which tracks
total mutex wait time for sync.Mutex and sync.RWMutex. The purpose of
this metric is to be able to quickly get an idea of the total mutex
wait time.

The implementation of this metric piggybacks off of the existing G
runnable tracking infrastructure, as well as the wait reason set on a
G when it goes into _Gwaiting.

Fixes #49881.

Change-Id: I4691abf64ac3574bec69b4d7d4428b1573130517
Reviewed-on: https://go-review.googlesource.com/c/go/+/427618
Reviewed-by: Michael Pratt <[email protected]>
Auto-Submit: Michael Knyszek <[email protected]>
Run-TryBot: Michael Knyszek <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
1 parent 686b38b commit 63ceff9

File tree: 7 files changed, +221 -25 lines changed

src/runtime/export_test.go
Lines changed: 8 additions & 0 deletions

@@ -525,6 +525,12 @@ func Getg() *G {
 	return getg()
 }
 
+func GIsWaitingOnMutex(gp *G) bool {
+	return readgstatus(gp) == _Gwaiting && gp.waitreason.isMutexWait()
+}
+
+var CasGStatusAlwaysTrack = &casgstatusAlwaysTrack
+
 //go:noinline
 func PanicForTesting(b []byte, i int) byte {
 	return unexportedPanicForTesting(b, i)
@@ -1617,3 +1623,5 @@ func (s *ScavengeIndex) Mark(base, limit uintptr) {
 func (s *ScavengeIndex) Clear(ci ChunkIdx) {
 	s.i.clear(chunkIdx(ci))
 }
+
+const GTrackingPeriod = gTrackingPeriod

src/runtime/metrics.go
Lines changed: 6 additions & 0 deletions

@@ -411,6 +411,12 @@ func initMetrics() {
 				hist.counts[len(hist.counts)-1] = sched.timeToRun.overflow.Load()
 			},
 		},
+		"/sync/mutex/wait/total:seconds": {
+			compute: func(_ *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(sched.totalMutexWaitTime.Load()))
+			},
+		},
 	}
 	metricsInit = true
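For context, the compute callback converts the runtime's nanosecond counter into seconds and stores the float64 bit pattern in the sample. A minimal standalone sketch of that conversion, using the standard library's math.Float64bits in place of the runtime-internal float64bits (the wait value here is made up):

package main

import (
	"fmt"
	"math"
)

// nsToSec mirrors what the runtime helper of the same name does:
// convert a nanosecond count into seconds as a float64.
func nsToSec(ns int64) float64 {
	return float64(ns) / 1e9
}

func main() {
	totalWaitNs := int64(2_500_000_000) // hypothetical 2.5s of cumulative mutex wait
	sec := nsToSec(totalWaitNs)
	// The metrics machinery stores float64 samples as IEEE 754 bits.
	bits := math.Float64bits(sec)
	fmt.Printf("%.3fs -> %#x\n", sec, bits)
}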

src/runtime/metrics/description.go
Lines changed: 6 additions & 0 deletions

@@ -366,6 +366,12 @@ var allDesc = []Description{
 		Description: "Distribution of the time goroutines have spent in the scheduler in a runnable state before actually running.",
 		Kind:        KindFloat64Histogram,
 	},
+	{
+		Name:        "/sync/mutex/wait/total:seconds",
+		Description: "Approximate cumulative time goroutines have spent blocked on a sync.Mutex or sync.RWMutex. This metric is useful for identifying global changes in lock contention. Collect a mutex or block profile using the runtime/pprof package for more detailed contention data.",
+		Kind:        KindFloat64,
+		Cumulative:  true,
+	},
 }
 
 // All returns a slice containing metric descriptions for all supported metrics.
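Because the new entry is marked Cumulative, consumers typically read it twice and look at the delta over an interval rather than at the absolute value. A sketch of that pattern (the ten-second window is an arbitrary choice, not something the package prescribes):

package main

import (
	"fmt"
	"runtime/metrics"
	"time"
)

func main() {
	sample := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}

	metrics.Read(sample)
	before := sample[0].Value.Float64()

	time.Sleep(10 * time.Second) // observation window

	metrics.Read(sample)
	after := sample[0].Value.Float64()

	// Seconds of mutex wait accumulated per wall-clock second, summed
	// across all goroutines; a rising rate suggests growing contention.
	fmt.Printf("mutex wait rate: %.4f s/s\n", (after-before)/10)
}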

src/runtime/metrics/doc.go
Lines changed: 7 additions & 0 deletions

@@ -272,5 +272,12 @@ Below is the full list of supported metrics, ordered lexicographically.
 	/sched/latencies:seconds
 		Distribution of the time goroutines have spent in the scheduler
 		in a runnable state before actually running.
+
+	/sync/mutex/wait/total:seconds
+		Approximate cumulative time goroutines have spent blocked on a
+		sync.Mutex or sync.RWMutex. This metric is useful for identifying
+		global changes in lock contention. Collect a mutex or block
+		profile using the runtime/pprof package for more detailed
+		contention data.
 */
 package metrics
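The documentation points readers at runtime/pprof for per-call-site detail. A minimal sketch of collecting a mutex profile alongside this metric (the deliberately contended lock and the 50ms sleep exist only to give the profile something to record):

package main

import (
	"os"
	"runtime"
	"runtime/pprof"
	"sync"
	"time"
)

func main() {
	// Sample every contention event; 0 would disable mutex profiling.
	runtime.SetMutexProfileFraction(1)
	defer runtime.SetMutexProfileFraction(0)

	// Manufacture a little contention.
	var mu sync.Mutex
	mu.Lock()
	go func() {
		time.Sleep(50 * time.Millisecond)
		mu.Unlock()
	}()
	mu.Lock() // blocks until the goroutine above unlocks
	mu.Unlock()

	// Dump the accumulated mutex profile in its debug text form.
	pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
}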

src/runtime/metrics_test.go
Lines changed: 126 additions & 1 deletion

@@ -5,6 +5,7 @@
 package runtime_test
 
 import (
+	"reflect"
 	"runtime"
 	"runtime/metrics"
 	"sort"
@@ -156,7 +157,7 @@ func TestReadMetricsConsistency(t *testing.T) {
 	// Tests whether readMetrics produces consistent, sensible values.
 	// The values are read concurrently with the runtime doing other
 	// things (e.g. allocating) so what we read can't reasonably be compared
-	// to runtime values.
+	// to other runtime values (e.g. MemStats).
 
 	// Run a few GC cycles to get some of the stats to be non-zero.
 	runtime.GC()
@@ -486,3 +487,127 @@ func TestReadMetricsCumulative(t *testing.T) {
 func withinEpsilon(v1, v2, e float64) bool {
 	return v2-v2*e <= v1 && v1 <= v2+v2*e
 }
+
+func TestMutexWaitTimeMetric(t *testing.T) {
+	var sample [1]metrics.Sample
+	sample[0].Name = "/sync/mutex/wait/total:seconds"
+
+	locks := []locker2{
+		new(mutex),
+		new(rwmutexWrite),
+		new(rwmutexReadWrite),
+		new(rwmutexWriteRead),
+	}
+	for _, lock := range locks {
+		t.Run(reflect.TypeOf(lock).Elem().Name(), func(t *testing.T) {
+			metrics.Read(sample[:])
+			before := time.Duration(sample[0].Value.Float64() * 1e9)
+
+			minMutexWaitTime := generateMutexWaitTime(lock)
+
+			metrics.Read(sample[:])
+			after := time.Duration(sample[0].Value.Float64() * 1e9)
+
+			if wt := after - before; wt < minMutexWaitTime {
+				t.Errorf("too little mutex wait time: got %s, want %s", wt, minMutexWaitTime)
+			}
+		})
+	}
+}
+
+// locker2 represents an API surface of two concurrent goroutines
+// locking the same resource, but through different APIs. It's intended
+// to abstract over the relationship of two Lock calls or an RLock
+// and a Lock call.
+type locker2 interface {
+	Lock1()
+	Unlock1()
+	Lock2()
+	Unlock2()
+}
+
+type mutex struct {
+	mu sync.Mutex
+}
+
+func (m *mutex) Lock1()   { m.mu.Lock() }
+func (m *mutex) Unlock1() { m.mu.Unlock() }
+func (m *mutex) Lock2()   { m.mu.Lock() }
+func (m *mutex) Unlock2() { m.mu.Unlock() }
+
+type rwmutexWrite struct {
+	mu sync.RWMutex
+}
+
+func (m *rwmutexWrite) Lock1()   { m.mu.Lock() }
+func (m *rwmutexWrite) Unlock1() { m.mu.Unlock() }
+func (m *rwmutexWrite) Lock2()   { m.mu.Lock() }
+func (m *rwmutexWrite) Unlock2() { m.mu.Unlock() }
+
+type rwmutexReadWrite struct {
+	mu sync.RWMutex
+}
+
+func (m *rwmutexReadWrite) Lock1()   { m.mu.RLock() }
+func (m *rwmutexReadWrite) Unlock1() { m.mu.RUnlock() }
+func (m *rwmutexReadWrite) Lock2()   { m.mu.Lock() }
+func (m *rwmutexReadWrite) Unlock2() { m.mu.Unlock() }
+
+type rwmutexWriteRead struct {
+	mu sync.RWMutex
+}
+
+func (m *rwmutexWriteRead) Lock1()   { m.mu.Lock() }
+func (m *rwmutexWriteRead) Unlock1() { m.mu.Unlock() }
+func (m *rwmutexWriteRead) Lock2()   { m.mu.RLock() }
+func (m *rwmutexWriteRead) Unlock2() { m.mu.RUnlock() }
+
+// generateMutexWaitTime causes a couple of goroutines
+// to block a whole bunch of times on a sync.Mutex, returning
+// the minimum amount of time that should be visible in the
+// /sync/mutex/wait/total:seconds metric.
+func generateMutexWaitTime(mu locker2) time.Duration {
+	// Set up the runtime to always track casgstatus transitions for metrics.
+	*runtime.CasGStatusAlwaysTrack = true
+
+	mu.Lock1()
+
+	// Start up a goroutine to wait on the lock.
+	gc := make(chan *runtime.G)
+	done := make(chan bool)
+	go func() {
+		gc <- runtime.Getg()
+
+		for {
+			mu.Lock2()
+			mu.Unlock2()
+			if <-done {
+				return
+			}
+		}
+	}()
+	gp := <-gc
+
+	// Set the block time high enough so that it will always show up, even
+	// on systems with coarse timer granularity.
+	const blockTime = 100 * time.Millisecond
+
+	// Make sure the goroutine spawned above actually blocks on the lock.
+	for {
+		if runtime.GIsWaitingOnMutex(gp) {
+			break
+		}
+		runtime.Gosched()
+	}
+
+	// Let some amount of time pass.
+	time.Sleep(blockTime)
+
+	// Let the other goroutine acquire the lock.
+	mu.Unlock1()
+	done <- true
+
+	// Reset flag.
+	*runtime.CasGStatusAlwaysTrack = false
+	return blockTime
+}

src/runtime/proc.go
Lines changed: 57 additions & 23 deletions

@@ -954,6 +954,10 @@ func castogscanstatus(gp *g, oldval, newval uint32) bool {
 	panic("not reached")
 }
 
+// casgstatusAlwaysTrack is a debug flag that causes casgstatus to always track
+// various latencies on every transition instead of sampling them.
+var casgstatusAlwaysTrack = false
+
 // If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus
 // and casfrom_Gscanstatus instead.
 // casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that
@@ -994,43 +998,73 @@ func casgstatus(gp *g, oldval, newval uint32) {
 		}
 	}
 
-	// Handle tracking for scheduling latencies.
 	if oldval == _Grunning {
-		// Track every 8th time a goroutine transitions out of running.
-		if gp.trackingSeq%gTrackingPeriod == 0 {
+		// Track every gTrackingPeriod time a goroutine transitions out of running.
+		if casgstatusAlwaysTrack || gp.trackingSeq%gTrackingPeriod == 0 {
 			gp.tracking = true
 		}
 		gp.trackingSeq++
 	}
-	if gp.tracking {
-		if oldval == _Grunnable {
-			// We transitioned out of runnable, so measure how much
-			// time we spent in this state and add it to
-			// runnableTime.
-			now := nanotime()
-			gp.runnableTime += now - gp.runnableStamp
-			gp.runnableStamp = 0
+	if !gp.tracking {
+		return
+	}
+
+	// Handle various kinds of tracking.
+	//
+	// Currently:
+	// - Time spent in runnable.
+	// - Time spent blocked on a sync.Mutex or sync.RWMutex.
+	switch oldval {
+	case _Grunnable:
+		// We transitioned out of runnable, so measure how much
+		// time we spent in this state and add it to
+		// runnableTime.
+		now := nanotime()
+		gp.runnableTime += now - gp.trackingStamp
+		gp.trackingStamp = 0
+	case _Gwaiting:
+		if !gp.waitreason.isMutexWait() {
+			// Not blocking on a lock.
+			break
 		}
-		if newval == _Grunnable {
-			// We just transitioned into runnable, so record what
-			// time that happened.
-			now := nanotime()
-			gp.runnableStamp = now
-		} else if newval == _Grunning {
-			// We're transitioning into running, so turn off
-			// tracking and record how much time we spent in
-			// runnable.
-			gp.tracking = false
-			sched.timeToRun.record(gp.runnableTime)
-			gp.runnableTime = 0
+		// Blocking on a lock, measure it. Note that because we're
+		// sampling, we have to multiply by our sampling period to get
+		// a more representative estimate of the absolute value.
+		// gTrackingPeriod also represents an accurate sampling period
+		// because we can only enter this state from _Grunning.
+		now := nanotime()
+		sched.totalMutexWaitTime.Add((now - gp.trackingStamp) * gTrackingPeriod)
+		gp.trackingStamp = 0
+	}
+	switch newval {
+	case _Gwaiting:
+		if !gp.waitreason.isMutexWait() {
+			// Not blocking on a lock.
+			break
 		}
+		// Blocking on a lock. Write down the timestamp.
+		now := nanotime()
+		gp.trackingStamp = now
+	case _Grunnable:
+		// We just transitioned into runnable, so record what
+		// time that happened.
+		now := nanotime()
+		gp.trackingStamp = now
+	case _Grunning:
+		// We're transitioning into running, so turn off
+		// tracking and record how much time we spent in
+		// runnable.
+		gp.tracking = false
+		sched.timeToRun.record(gp.runnableTime)
+		gp.runnableTime = 0
 	}
 }
 
 // casGToWaiting transitions gp from old to _Gwaiting, and sets the wait reason.
 //
 // Use this over casgstatus when possible to ensure that a waitreason is set.
 func casGToWaiting(gp *g, old uint32, reason waitReason) {
+	// Set the wait reason before calling casgstatus, because casgstatus will use it.
 	gp.waitreason = reason
 	casgstatus(gp, old, _Gwaiting)
 }
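The multiplication by gTrackingPeriod deserves a worked example: only about one in gTrackingPeriod (currently 8) transitions out of _Grunning turns tracking on, so each observed wait stands in for roughly eight and is scaled accordingly, keeping the cumulative metric an approximately unbiased estimate. A toy illustration with made-up numbers:

package main

import "fmt"

// gTrackingPeriod mirrors the runtime constant: roughly one in every
// eight transitions out of _Grunning is tracked.
const gTrackingPeriod = 8

func main() {
	// Hypothetical: 1000 lock waits of 5ms each actually occur,
	// but only ~1/8 of them are sampled.
	const waitNs = 5_000_000
	sampled := 1000 / gTrackingPeriod

	var total int64
	for i := 0; i < sampled; i++ {
		// Scale each sampled wait by the sampling period, exactly
		// as sched.totalMutexWaitTime.Add does above.
		total += waitNs * gTrackingPeriod
	}
	fmt.Printf("estimated total: %dms (true total: 5000ms)\n", total/1_000_000)
}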

src/runtime/runtime2.go
Lines changed: 11 additions & 1 deletion

@@ -468,7 +468,7 @@ type g struct {
 	sysblocktraced bool  // StartTrace has emitted EvGoInSyscall about this goroutine
 	tracking       bool  // whether we're tracking this G for sched latency statistics
 	trackingSeq    uint8 // used to decide whether to track this G
-	runnableStamp  int64 // timestamp of when the G last became runnable, only used when tracking
+	trackingStamp  int64 // timestamp of when the G last started being tracked
 	runnableTime   int64 // the amount of time spent runnable, cleared when running, only used when tracking
 	sysexitticks   int64 // cputicks when syscall has returned (for tracing)
 	traceseq       uint64 // trace event sequencer
@@ -843,6 +843,10 @@ type schedt struct {
 	//
 	// Reset on each GC cycle.
 	idleTime atomic.Int64
+
+	// totalMutexWaitTime is the sum of time goroutines have spent in _Gwaiting
+	// with a waitreason of the form waitReasonSync{RW,}Mutex{R,}Lock.
+	totalMutexWaitTime atomic.Int64
 }
 
 // Values for the flags field of a sigTabT.
@@ -1109,6 +1113,12 @@ func (w waitReason) String() string {
 	return waitReasonStrings[w]
 }
 
+func (w waitReason) isMutexWait() bool {
+	return w == waitReasonSyncMutexLock ||
+		w == waitReasonSyncRWMutexRLock ||
+		w == waitReasonSyncRWMutexLock
+}
+
 var (
 	allm       *m
 	gomaxprocs int32