Commit ba1c5b2

rhysh authored and gopherbot committed

runtime: profile mutex contention during unlock

When an M's use of a lock causes delays in other Ms, capture the stack of the
unlock call that caused the delay. This makes the meaning of the mutex profile
for runtime-internal mutexes match the behavior for sync.Mutex: the profile
points to the end of the critical section that is responsible for delaying
other work.

Fixes #66999

Change-Id: I4abc4a1df00a48765d29c07776481a1cbd539ff8
Reviewed-on: https://go-review.googlesource.com/c/go/+/585638
Auto-Submit: Rhys Hiltner <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Carlos Amedee <[email protected]>
Reviewed-by: Michael Pratt <[email protected]>
1 parent d881ed6 commit ba1c5b2
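
For context, this is the sync.Mutex behavior the change matches: the mutex profile charges the delay experienced by waiting goroutines to the call stack of the Unlock that ends the long critical section. A minimal illustration (the program and its durations are invented for this example and are not part of the commit):

```go
package main

import (
	"runtime"
	"sync"
	"time"
)

func main() {
	runtime.SetMutexProfileFraction(1) // record every contention event

	var mu sync.Mutex
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			mu.Lock()
			time.Sleep(10 * time.Millisecond) // long critical section
			mu.Unlock()                       // the mutex profile blames this call's stack
		}()
	}
	wg.Wait()

	// Reading the "mutex" profile (for example via runtime/pprof) attributes the
	// other goroutines' wait time to the Unlock above, the end of the critical
	// section. This commit brings the same semantics to runtime-internal locks.
}
```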

File tree

src/runtime/lock_futex.go
src/runtime/lock_sema.go
src/runtime/mprof.go

3 files changed: +80 -128 lines changed

src/runtime/lock_futex.go

Lines changed: 10 additions & 5 deletions
@@ -72,9 +72,6 @@ func lock2(l *mutex) {
 		return
 	}
 
-	timer := &lockTimer{lock: l}
-	timer.begin()
-
 	// If a goroutine's stack needed to grow during a lock2 call, the M could
 	// end up with two active lock2 calls (one each on curg and g0). If both are
 	// contended, the call on g0 will corrupt mWaitList. Disable stack growth.
@@ -83,6 +80,7 @@ func lock2(l *mutex) {
 		gp.stackguard0, gp.throwsplit = stackPreempt, true
 	}
 
+	gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
 	// On uniprocessors, no point spinning.
 	// On multiprocessors, spin for ACTIVE_SPIN attempts.
 	spin := 0
@@ -114,14 +112,14 @@ Loop:
 
 				if v == old || atomic.Casuintptr(&l.key, old, v) {
 					gp.m.mWaitList.clearLinks()
+					gp.m.mWaitList.acquireTimes = timePair{}
 					break
 				}
 				v = atomic.Loaduintptr(&l.key)
 			}
 			if gp == gp.m.curg {
 				gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
 			}
-			timer.end()
 			return
 		}
 		i = 0
@@ -163,6 +161,7 @@ func unlock(l *mutex) {
 }
 
 func unlock2(l *mutex) {
+	now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
 	for {
 		v := atomic.Loaduintptr(&l.key)
 		if v == mutex_locked {
@@ -172,6 +171,12 @@ func unlock2(l *mutex) {
 		} else if v&mutex_locked == 0 {
 			throw("unlock of unlocked lock")
 		} else {
+			if now != (timePair{}) {
+				head := muintptr(v &^ (mutex_sleeping | mutex_locked))
+				dt = claimMutexWaitTime(now, head)
+				now = timePair{}
+			}
+
 			// Other M's are waiting for the lock.
 			if atomic.Casuintptr(&l.key, v, v&^mutex_locked) {
 				futexwakeup(key32(&l.key), 1)
@@ -181,7 +186,7 @@ func unlock2(l *mutex) {
 	}
 
 	gp := getg()
-	gp.m.mLockProfile.recordUnlock(l)
+	gp.m.mLockProfile.recordUnlock(dt)
 	gp.m.locks--
 	if gp.m.locks < 0 {
 		throw("runtime·unlock: lock count")

src/runtime/lock_sema.go

Lines changed: 10 additions & 5 deletions
@@ -52,9 +52,6 @@ func lock2(l *mutex) {
 	}
 	semacreate(gp.m)
 
-	timer := &lockTimer{lock: l}
-	timer.begin()
-
 	// If a goroutine's stack needed to grow during a lock2 call, the M could
 	// end up with two active lock2 calls (one each on curg and g0). If both are
 	// contended, the call on g0 will corrupt mWaitList. Disable stack growth.
@@ -63,6 +60,7 @@ func lock2(l *mutex) {
 		gp.stackguard0, gp.throwsplit = stackPreempt, true
 	}
 
+	gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
 	// On uniprocessor's, no point spinning.
 	// On multiprocessors, spin for ACTIVE_SPIN attempts.
 	spin := 0
@@ -90,14 +88,14 @@ Loop:
 
 				if v == old || atomic.Casuintptr(&l.key, old, v) {
 					gp.m.mWaitList.clearLinks()
+					gp.m.mWaitList.acquireTimes = timePair{}
 					break
 				}
 				v = atomic.Loaduintptr(&l.key)
 			}
 			if gp == gp.m.curg {
 				gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
 			}
-			timer.end()
 			return
 		}
 		i = 0
@@ -147,6 +145,7 @@ func unlock(l *mutex) {
 //
 //go:nowritebarrier
 func unlock2(l *mutex) {
+	now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
 	gp := getg()
 	var mp *m
 	for {
@@ -156,6 +155,11 @@ func unlock2(l *mutex) {
 				break
 			}
 		} else {
+			if now != (timePair{}) {
+				dt = claimMutexWaitTime(now, muintptr(v&^locked))
+				now = timePair{}
+			}
+
 			// Other M's are waiting for the lock.
 			// Dequeue an M.
 			mp = muintptr(v &^ locked).ptr()
@@ -166,7 +170,8 @@ func unlock2(l *mutex) {
 			}
 		}
 	}
-	gp.m.mLockProfile.recordUnlock(l)
+
+	gp.m.mLockProfile.recordUnlock(dt)
 	gp.m.locks--
 	if gp.m.locks < 0 {
 		throw("runtime·unlock: lock count")

src/runtime/mprof.go

Lines changed: 60 additions & 118 deletions
@@ -781,99 +781,34 @@ func removeMutexWaitList(head muintptr, mp *m) muintptr {
 	return head
 }
 
-// lockTimer assists with profiling contention on runtime-internal locks.
-//
-// There are several steps between the time that an M experiences contention and
-// when that contention may be added to the profile. This comes from our
-// constraints: We need to keep the critical section of each lock small,
-// especially when those locks are contended. The reporting code cannot acquire
-// new locks until the M has released all other locks, which means no memory
-// allocations and encourages use of (temporary) M-local storage.
-//
-// The M will have space for storing one call stack that caused contention, and
-// for the magnitude of that contention. It will also have space to store the
-// magnitude of additional contention the M caused, since it only has space to
-// remember one call stack and might encounter several contention events before
-// it releases all of its locks and is thus able to transfer the local buffer
-// into the profile.
-//
-// The M will collect the call stack when it unlocks the contended lock. That
-// minimizes the impact on the critical section of the contended lock, and
-// matches the mutex profile's behavior for contention in sync.Mutex: measured
-// at the Unlock method.
-//
-// The profile for contention on sync.Mutex blames the caller of Unlock for the
-// amount of contention experienced by the callers of Lock which had to wait.
-// When there are several critical sections, this allows identifying which of
-// them is responsible.
-//
-// Matching that behavior for runtime-internal locks will require identifying
-// which Ms are blocked on the mutex. The semaphore-based implementation is
-// ready to allow that, but the futex-based implementation will require a bit
-// more work. Until then, we report contention on runtime-internal locks with a
-// call stack taken from the unlock call (like the rest of the user-space
-// "mutex" profile), but assign it a duration value based on how long the
-// previous lock call took (like the user-space "block" profile).
-//
-// Thus, reporting the call stacks of runtime-internal lock contention is
-// guarded by GODEBUG for now. Set GODEBUG=runtimecontentionstacks=1 to enable.
-//
-// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
-//
-// The M will track this by storing a pointer to the lock; lock/unlock pairs for
-// runtime-internal locks are always on the same M.
-//
-// Together, that demands several steps for recording contention. First, when
-// finally acquiring a contended lock, the M decides whether it should plan to
-// profile that event by storing a pointer to the lock in its "to be profiled
-// upon unlock" field. If that field is already set, it uses the relative
-// magnitudes to weight a random choice between itself and the other lock, with
-// the loser's time being added to the "additional contention" field. Otherwise
-// if the M's call stack buffer is occupied, it does the comparison against that
-// sample's magnitude.
-//
-// Second, having unlocked a mutex the M checks to see if it should capture the
-// call stack into its local buffer. Finally, when the M unlocks its last mutex,
-// it transfers the local buffer into the profile. As part of that step, it also
-// transfers any "additional contention" time to the profile. Any lock
-// contention that it experiences while adding samples to the profile will be
-// recorded later as "additional contention" and not include a call stack, to
-// avoid an echo.
-type lockTimer struct {
-	lock      *mutex
-	timeRate  int64
-	timeStart int64
-	tickStart int64
-}
-
-func (lt *lockTimer) begin() {
-	rate := int64(atomic.Load64(&mutexprofilerate))
-
-	lt.timeRate = gTrackingPeriod
-	if rate != 0 && rate < lt.timeRate {
-		lt.timeRate = rate
-	}
-	if int64(cheaprand())%lt.timeRate == 0 {
-		lt.timeStart = nanotime()
+// claimMutexWaitTime advances the acquireTimes of the list of waiting Ms at
+// head to now, returning an estimate of the total wait time claimed by that
+// action.
+func claimMutexWaitTime(now timePair, head muintptr) timePair {
+	fixMutexWaitList(head)
+	hp := head.ptr()
+	if hp == nil {
+		return timePair{}
 	}
+	tp := hp.mWaitList.tail.ptr()
+	waiters := hp.mWaitList.waiters
+	headTimes := hp.mWaitList.acquireTimes
+	tailTimes := tp.mWaitList.acquireTimes
 
-	if rate > 0 && int64(cheaprand())%rate == 0 {
-		lt.tickStart = cputicks()
+	var dt timePair
+	dt.nanotime = now.nanotime - headTimes.nanotime
+	dt.cputicks = now.cputicks - headTimes.cputicks
+	if waiters > 1 {
+		dt.nanotime = int64(waiters) * (dt.nanotime + now.nanotime - tailTimes.nanotime) / 2
+		dt.cputicks = int64(waiters) * (dt.cputicks + now.cputicks - tailTimes.cputicks) / 2
 	}
-}
 
-func (lt *lockTimer) end() {
-	gp := getg()
+	// When removeMutexWaitList removes a head or tail node, it's responsible
+	// for applying these changes to the new head or tail.
+	hp.mWaitList.acquireTimes = now
+	tp.mWaitList.acquireTimes = now
 
-	if lt.timeStart != 0 {
-		nowTime := nanotime()
-		gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
-	}
-
-	if lt.tickStart != 0 {
-		nowTick := cputicks()
-		gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
-	}
+	return dt
 }
 
 // mLockProfile is part of the M struct to hold information relating to mutex
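
The estimate above assumes the queued Ms arrived at a roughly even rate, so their individual waits interpolate linearly between the head's (longest) wait and the tail's (shortest); the sum of that series is waiters * (headWait + tailWait) / 2. A standalone sketch of the arithmetic, with invented numbers:

```go
package main

import "fmt"

// estimateTotalWait mirrors the shape of claimMutexWaitTime's estimate: given
// the head's wait, the tail's wait, and the number of queued Ms, assume evenly
// spaced arrivals and charge the average of the two extremes to every waiter.
func estimateTotalWait(waiters, headWait, tailWait int64) int64 {
	if waiters <= 1 {
		return headWait
	}
	return waiters * (headWait + tailWait) / 2
}

func main() {
	// Three Ms are queued; the oldest has waited 9ms and the newest 1ms, so
	// this unlock is charged 3 * (9+1)/2 = 15ms of delay.
	fmt.Println(estimateTotalWait(3, 9_000_000, 1_000_000)) // 15000000 (ns)
}
```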
@@ -890,18 +825,49 @@ func (lt *lockTimer) end() {
 // not include a call stack, to avoid an echo.
 type mLockProfile struct {
 	waitTime   atomic.Int64 // total nanoseconds spent waiting in runtime.lockWithRank
-	stack      []uintptr    // stack that experienced contention in runtime.lockWithRank
-	pending    uintptr      // *mutex that experienced contention (to be traceback-ed)
-	cycles     int64        // cycles attributable to "pending" (if set), otherwise to "stack"
+	stack      []uintptr    // unlock stack that caused delay in other Ms' runtime.lockWithRank
+	cycles     int64        // cycles attributable to "stack"
 	cyclesLost int64        // contention for which we weren't able to record a call stack
 	disabled   bool         // attribute all time to "lost"
 }
 
-func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
+// recordUnlock considers the current unlock call (which caused a total of dt
+// delay in other Ms) for later inclusion in the mutex contention profile. If
+// this M holds no other locks, it transfers the buffered contention record to
+// the mutex contention profile.
+//
+// From unlock2, we might not be holding a p in this code.
+//
+//go:nowritebarrierrec
+func (prof *mLockProfile) recordUnlock(dt timePair) {
+	if dt != (timePair{}) {
+		// We could make a point of clearing out the local storage right before
+		// this, to have a slightly better chance of being able to see the call
+		// stack if the program has several (nested) contended locks. If apps
+		// are seeing a lot of _LostContendedRuntimeLock samples, maybe that'll
+		// be a worthwhile change.
+		prof.proposeUnlock(dt)
+	}
+	if getg().m.locks == 1 && prof.cycles != 0 {
+		prof.store()
+	}
+}
+
+func (prof *mLockProfile) proposeUnlock(dt timePair) {
+	if nanos := dt.nanotime; nanos > 0 {
+		prof.waitTime.Add(nanos)
+	}
+
+	cycles := dt.cputicks
 	if cycles <= 0 {
 		return
 	}
 
+	rate := int64(atomic.Load64(&mutexprofilerate))
+	if rate <= 0 || int64(cheaprand())%rate != 0 {
+		return
+	}
+
 	if prof.disabled {
 		// We're experiencing contention while attempting to report contention.
 		// Make a note of its magnitude, but don't allow it to be the sole cause
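
proposeUnlock separates the two profiles' needs: the nanotime portion is always added to waitTime (the block-profile-style total), while a call stack is captured for only about one contention event in rate, where rate is the mutex profile fraction; the pprof package scales sampled values back up by that fraction when the profile is rendered. A minimal sketch of the sampling decision (math/rand/v2 stands in for the runtime's cheaprand; illustrative only, not the runtime code):

```go
package sketch

import "math/rand/v2"

// shouldSampleStack mirrors the shape of the rate check in proposeUnlock:
// with SetMutexProfileFraction(rate), roughly one event in rate keeps its
// call stack; rate <= 0 means the mutex profile is disabled.
func shouldSampleStack(rate int64) bool {
	if rate <= 0 {
		return false
	}
	return rand.Int64N(rate) == 0
}
```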
@@ -910,13 +876,6 @@ func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
 		return
 	}
 
-	if uintptr(unsafe.Pointer(l)) == prof.pending {
-		// Optimization: we'd already planned to profile this same lock (though
-		// possibly from a different unlock site).
-		prof.cycles += cycles
-		return
-	}
-
 	if prev := prof.cycles; prev > 0 {
 		// We can only store one call stack for runtime-internal lock contention
 		// on this M, and we've already got one. Decide which should stay, and
930889
prof.cyclesLost += prev
931890
}
932891
}
933-
// Saving the *mutex as a uintptr is safe because:
934-
// - lockrank_on.go does this too, which gives it regular exercise
935-
// - the lock would only move if it's stack allocated, which means it
936-
// cannot experience multi-M contention
937-
prof.pending = uintptr(unsafe.Pointer(l))
938892
prof.cycles = cycles
939-
}
940-
941-
// From unlock2, we might not be holding a p in this code.
942-
//
943-
//go:nowritebarrierrec
944-
func (prof *mLockProfile) recordUnlock(l *mutex) {
945-
if uintptr(unsafe.Pointer(l)) == prof.pending {
946-
prof.captureStack()
947-
}
948-
if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
949-
prof.store()
950-
}
893+
prof.captureStack()
951894
}
952895

953896
func (prof *mLockProfile) captureStack() {
@@ -957,7 +900,7 @@ func (prof *mLockProfile) captureStack() {
957900
return
958901
}
959902

960-
skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
903+
skip := 4 // runtime.(*mLockProfile).proposeUnlock runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
961904
if staticLockRanking {
962905
// When static lock ranking is enabled, we'll always be on the system
963906
// stack at this point. There will be a runtime.unlockWithRank.func1
@@ -970,7 +913,6 @@ func (prof *mLockProfile) captureStack() {
 		// "runtime.unlock".
 		skip += 1 // runtime.unlockWithRank.func1
 	}
-	prof.pending = 0
 
 	prof.stack[0] = logicalStackSentinel
 	if debug.runtimeContentionStacks.Load() == 0 {
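
Call stacks for runtime-internal lock contention remain gated behind GODEBUG (captureStack still consults debug.runtimeContentionStacks), so observing the new attribution would look roughly like the following; the program, file name, and profile fraction here are illustrative:

```go
// Run with: GODEBUG=runtimecontentionstacks=1 ./prog
// Then inspect: go tool pprof prog mutex.out
package main

import (
	"os"
	"runtime"
	"runtime/pprof"
)

func main() {
	runtime.SetMutexProfileFraction(100) // sample roughly 1 in 100 contention events

	workloadThatContendsRuntimeLocks() // placeholder for the workload of interest

	f, err := os.Create("mutex.out")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	pprof.Lookup("mutex").WriteTo(f, 0) // proto format, for go tool pprof
}

func workloadThatContendsRuntimeLocks() {}
```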
