
Commit 5dead59

rhysh authored and gopherbot committed
Revert "runtime: profile mutex contention during unlock"
This reverts commit ba1c5b2 (CL 585638).

Reason for revert: This is part of a patch series that changed the handling of contended lock2/unlock2 calls, reducing the maximum throughput of contended runtime.mutex values, and causing a performance regression on applications where that is (or became) the bottleneck.

Updates #66999
Updates #67585

Change-Id: Ibeec5d8deb17e87966cf352fefc7efe2267839d6
Reviewed-on: https://go-review.googlesource.com/c/go/+/589115
Auto-Submit: Rhys Hiltner <[email protected]>
Reviewed-by: Michael Pratt <[email protected]>
Reviewed-by: Than McIntosh <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
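For context on what the reverted instrumentation feeds: contention on runtime-internal locks is reported through the user-visible "mutex" profile, and (per the mprof.go comment restored below) its call stacks are gated behind GODEBUG=runtimecontentionstacks=1. A minimal sketch of how an application might collect that profile; the file name and sampling fraction are arbitrary choices here, not part of this change:

	package main

	import (
		"os"
		"runtime"
		"runtime/pprof"
	)

	func main() {
		// Sample every contention event; real programs usually pick a larger
		// fraction to keep overhead low.
		runtime.SetMutexProfileFraction(1)

		// ... run the contended workload; launch the process with
		// GODEBUG=runtimecontentionstacks=1 to get call stacks for
		// runtime-internal locks ...

		f, err := os.Create("mutex.pprof")
		if err != nil {
			panic(err)
		}
		defer f.Close()
		if err := pprof.Lookup("mutex").WriteTo(f, 0); err != nil {
			panic(err)
		}
	}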
1 parent ca7d300 commit 5dead59

File tree: 3 files changed, +128 -80 lines changed

  src/runtime/lock_futex.go
  src/runtime/lock_sema.go
  src/runtime/mprof.go

src/runtime/lock_futex.go (5 additions, 10 deletions)

@@ -72,6 +72,9 @@ func lock2(l *mutex) {
 		return
 	}
 
+	timer := &lockTimer{lock: l}
+	timer.begin()
+
 	// If a goroutine's stack needed to grow during a lock2 call, the M could
 	// end up with two active lock2 calls (one each on curg and g0). If both are
 	// contended, the call on g0 will corrupt mWaitList. Disable stack growth.
@@ -80,7 +83,6 @@ func lock2(l *mutex) {
 		gp.stackguard0, gp.throwsplit = stackPreempt, true
 	}
 
-	gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
 	// On uniprocessors, no point spinning.
 	// On multiprocessors, spin for ACTIVE_SPIN attempts.
 	spin := 0
@@ -112,14 +114,14 @@ Loop:
 
 			if v == old || atomic.Casuintptr(&l.key, old, v) {
 				gp.m.mWaitList.clearLinks()
-				gp.m.mWaitList.acquireTimes = timePair{}
 				break
 			}
 			v = atomic.Loaduintptr(&l.key)
 		}
 		if gp == gp.m.curg {
 			gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
 		}
+		timer.end()
 		return
 	}
 	i = 0
@@ -161,7 +163,6 @@ func unlock(l *mutex) {
 }
 
 func unlock2(l *mutex) {
-	now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
 	for {
 		v := atomic.Loaduintptr(&l.key)
 		if v == mutex_locked {
@@ -171,12 +172,6 @@ func unlock2(l *mutex) {
 		} else if v&mutex_locked == 0 {
 			throw("unlock of unlocked lock")
 		} else {
-			if now != (timePair{}) {
-				head := muintptr(v &^ (mutex_sleeping | mutex_locked))
-				dt = claimMutexWaitTime(now, head)
-				now = timePair{}
-			}
-
 			// Other M's are waiting for the lock.
 			if atomic.Casuintptr(&l.key, v, v&^mutex_locked) {
 				futexwakeup(key32(&l.key), 1)
@@ -186,7 +181,7 @@ func unlock2(l *mutex) {
 	}
 
 	gp := getg()
-	gp.m.mLockProfile.recordUnlock(dt)
+	gp.m.mLockProfile.recordUnlock(l)
 	gp.m.locks--
 	if gp.m.locks < 0 {
 		throw("runtime·unlock: lock count")

src/runtime/lock_sema.go (5 additions, 10 deletions)

@@ -52,6 +52,9 @@ func lock2(l *mutex) {
 	}
 	semacreate(gp.m)
 
+	timer := &lockTimer{lock: l}
+	timer.begin()
+
 	// If a goroutine's stack needed to grow during a lock2 call, the M could
 	// end up with two active lock2 calls (one each on curg and g0). If both are
 	// contended, the call on g0 will corrupt mWaitList. Disable stack growth.
@@ -60,7 +63,6 @@ func lock2(l *mutex) {
 		gp.stackguard0, gp.throwsplit = stackPreempt, true
 	}
 
-	gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
 	// On uniprocessor's, no point spinning.
 	// On multiprocessors, spin for ACTIVE_SPIN attempts.
 	spin := 0
@@ -88,14 +90,14 @@ Loop:
 
 			if v == old || atomic.Casuintptr(&l.key, old, v) {
 				gp.m.mWaitList.clearLinks()
-				gp.m.mWaitList.acquireTimes = timePair{}
 				break
 			}
 			v = atomic.Loaduintptr(&l.key)
 		}
 		if gp == gp.m.curg {
 			gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
 		}
+		timer.end()
 		return
 	}
 	i = 0
@@ -145,7 +147,6 @@ func unlock(l *mutex) {
 //
 //go:nowritebarrier
 func unlock2(l *mutex) {
-	now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
 	gp := getg()
 	var mp *m
 	for {
@@ -155,11 +156,6 @@ func unlock2(l *mutex) {
 				break
 			}
 		} else {
-			if now != (timePair{}) {
-				dt = claimMutexWaitTime(now, muintptr(v&^locked))
-				now = timePair{}
-			}
-
 			// Other M's are waiting for the lock.
 			// Dequeue an M.
 			mp = muintptr(v &^ locked).ptr()
@@ -170,8 +166,7 @@ func unlock2(l *mutex) {
 			}
 		}
 	}
-
-	gp.m.mLockProfile.recordUnlock(dt)
+	gp.m.mLockProfile.recordUnlock(l)
 	gp.m.locks--
 	if gp.m.locks < 0 {
 		throw("runtime·unlock: lock count")

src/runtime/mprof.go (118 additions, 60 deletions)

@@ -781,34 +781,99 @@ func removeMutexWaitList(head muintptr, mp *m) muintptr {
 	return head
 }
 
-// claimMutexWaitTime advances the acquireTimes of the list of waiting Ms at
-// head to now, returning an estimate of the total wait time claimed by that
-// action.
-func claimMutexWaitTime(now timePair, head muintptr) timePair {
-	fixMutexWaitList(head)
-	hp := head.ptr()
-	if hp == nil {
-		return timePair{}
-	}
-	tp := hp.mWaitList.tail.ptr()
-	waiters := hp.mWaitList.waiters
-	headTimes := hp.mWaitList.acquireTimes
-	tailTimes := tp.mWaitList.acquireTimes
-
-	var dt timePair
-	dt.nanotime = now.nanotime - headTimes.nanotime
-	dt.cputicks = now.cputicks - headTimes.cputicks
-	if waiters > 1 {
-		dt.nanotime = int64(waiters) * (dt.nanotime + now.nanotime - tailTimes.nanotime) / 2
-		dt.cputicks = int64(waiters) * (dt.cputicks + now.cputicks - tailTimes.cputicks) / 2
-	}
-
-	// When removeMutexWaitList removes a head or tail node, it's responsible
-	// for applying these changes to the new head or tail.
-	hp.mWaitList.acquireTimes = now
-	tp.mWaitList.acquireTimes = now
-
-	return dt
-}
+// lockTimer assists with profiling contention on runtime-internal locks.
+//
+// There are several steps between the time that an M experiences contention and
+// when that contention may be added to the profile. This comes from our
+// constraints: We need to keep the critical section of each lock small,
+// especially when those locks are contended. The reporting code cannot acquire
+// new locks until the M has released all other locks, which means no memory
+// allocations and encourages use of (temporary) M-local storage.
+//
+// The M will have space for storing one call stack that caused contention, and
+// for the magnitude of that contention. It will also have space to store the
+// magnitude of additional contention the M caused, since it only has space to
+// remember one call stack and might encounter several contention events before
+// it releases all of its locks and is thus able to transfer the local buffer
+// into the profile.
+//
+// The M will collect the call stack when it unlocks the contended lock. That
+// minimizes the impact on the critical section of the contended lock, and
+// matches the mutex profile's behavior for contention in sync.Mutex: measured
+// at the Unlock method.
+//
+// The profile for contention on sync.Mutex blames the caller of Unlock for the
+// amount of contention experienced by the callers of Lock which had to wait.
+// When there are several critical sections, this allows identifying which of
+// them is responsible.
+//
+// Matching that behavior for runtime-internal locks will require identifying
+// which Ms are blocked on the mutex. The semaphore-based implementation is
+// ready to allow that, but the futex-based implementation will require a bit
+// more work. Until then, we report contention on runtime-internal locks with a
+// call stack taken from the unlock call (like the rest of the user-space
+// "mutex" profile), but assign it a duration value based on how long the
+// previous lock call took (like the user-space "block" profile).
+//
+// Thus, reporting the call stacks of runtime-internal lock contention is
+// guarded by GODEBUG for now. Set GODEBUG=runtimecontentionstacks=1 to enable.
+//
+// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
+//
+// The M will track this by storing a pointer to the lock; lock/unlock pairs for
+// runtime-internal locks are always on the same M.
+//
+// Together, that demands several steps for recording contention. First, when
+// finally acquiring a contended lock, the M decides whether it should plan to
+// profile that event by storing a pointer to the lock in its "to be profiled
+// upon unlock" field. If that field is already set, it uses the relative
+// magnitudes to weight a random choice between itself and the other lock, with
+// the loser's time being added to the "additional contention" field. Otherwise
+// if the M's call stack buffer is occupied, it does the comparison against that
+// sample's magnitude.
+//
+// Second, having unlocked a mutex the M checks to see if it should capture the
+// call stack into its local buffer. Finally, when the M unlocks its last mutex,
+// it transfers the local buffer into the profile. As part of that step, it also
+// transfers any "additional contention" time to the profile. Any lock
+// contention that it experiences while adding samples to the profile will be
+// recorded later as "additional contention" and not include a call stack, to
+// avoid an echo.
+type lockTimer struct {
+	lock      *mutex
+	timeRate  int64
+	timeStart int64
+	tickStart int64
+}
+
+func (lt *lockTimer) begin() {
+	rate := int64(atomic.Load64(&mutexprofilerate))
+
+	lt.timeRate = gTrackingPeriod
+	if rate != 0 && rate < lt.timeRate {
+		lt.timeRate = rate
+	}
+	if int64(cheaprand())%lt.timeRate == 0 {
+		lt.timeStart = nanotime()
+	}
+
+	if rate > 0 && int64(cheaprand())%rate == 0 {
+		lt.tickStart = cputicks()
+	}
+}
+
+func (lt *lockTimer) end() {
+	gp := getg()
+
+	if lt.timeStart != 0 {
+		nowTime := nanotime()
+		gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
+	}
+
+	if lt.tickStart != 0 {
+		nowTick := cputicks()
+		gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
+	}
+}
 
 // mLockProfile is part of the M struct to hold information relating to mutex
@@ -825,49 +890,18 @@ func claimMutexWaitTime(now timePair, head muintptr) timePair {
 // not include a call stack, to avoid an echo.
 type mLockProfile struct {
 	waitTime   atomic.Int64 // total nanoseconds spent waiting in runtime.lockWithRank
-	stack      []uintptr    // unlock stack that caused delay in other Ms' runtime.lockWithRank
-	cycles     int64        // cycles attributable to "stack"
+	stack      []uintptr    // stack that experienced contention in runtime.lockWithRank
+	pending    uintptr      // *mutex that experienced contention (to be traceback-ed)
+	cycles     int64        // cycles attributable to "pending" (if set), otherwise to "stack"
 	cyclesLost int64        // contention for which we weren't able to record a call stack
 	disabled   bool         // attribute all time to "lost"
 }
 
-// recordUnlock considers the current unlock call (which caused a total of dt
-// delay in other Ms) for later inclusion in the mutex contention profile. If
-// this M holds no other locks, it transfers the buffered contention record to
-// the mutex contention profile.
-//
-// From unlock2, we might not be holding a p in this code.
-//
-//go:nowritebarrierrec
-func (prof *mLockProfile) recordUnlock(dt timePair) {
-	if dt != (timePair{}) {
-		// We could make a point of clearing out the local storage right before
-		// this, to have a slightly better chance of being able to see the call
-		// stack if the program has several (nested) contended locks. If apps
-		// are seeing a lot of _LostContendedRuntimeLock samples, maybe that'll
-		// be a worthwhile change.
-		prof.proposeUnlock(dt)
-	}
-	if getg().m.locks == 1 && prof.cycles != 0 {
-		prof.store()
-	}
-}
-
-func (prof *mLockProfile) proposeUnlock(dt timePair) {
-	if nanos := dt.nanotime; nanos > 0 {
-		prof.waitTime.Add(nanos)
-	}
-
-	cycles := dt.cputicks
+func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
 	if cycles <= 0 {
 		return
 	}
 
-	rate := int64(atomic.Load64(&mutexprofilerate))
-	if rate <= 0 || int64(cheaprand())%rate != 0 {
-		return
-	}
-
 	if prof.disabled {
 		// We're experiencing contention while attempting to report contention.
 		// Make a note of its magnitude, but don't allow it to be the sole cause
@@ -876,6 +910,13 @@ func (prof *mLockProfile) proposeUnlock(dt timePair) {
 		return
 	}
 
+	if uintptr(unsafe.Pointer(l)) == prof.pending {
+		// Optimization: we'd already planned to profile this same lock (though
+		// possibly from a different unlock site).
+		prof.cycles += cycles
+		return
+	}
+
 	if prev := prof.cycles; prev > 0 {
 		// We can only store one call stack for runtime-internal lock contention
 		// on this M, and we've already got one. Decide which should stay, and
@@ -889,8 +930,24 @@ func (prof *mLockProfile) proposeUnlock(dt timePair) {
 			prof.cyclesLost += prev
 		}
 	}
+	// Saving the *mutex as a uintptr is safe because:
+	//   - lockrank_on.go does this too, which gives it regular exercise
+	//   - the lock would only move if it's stack allocated, which means it
+	//     cannot experience multi-M contention
+	prof.pending = uintptr(unsafe.Pointer(l))
 	prof.cycles = cycles
-	prof.captureStack()
+}
+
+// From unlock2, we might not be holding a p in this code.
+//
+//go:nowritebarrierrec
+func (prof *mLockProfile) recordUnlock(l *mutex) {
+	if uintptr(unsafe.Pointer(l)) == prof.pending {
+		prof.captureStack()
+	}
+	if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
+		prof.store()
+	}
 }
 
 func (prof *mLockProfile) captureStack() {
@@ -900,7 +957,7 @@ func (prof *mLockProfile) captureStack() {
 		return
 	}
 
-	skip := 4 // runtime.(*mLockProfile).proposeUnlock runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
+	skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
 	if staticLockRanking {
 		// When static lock ranking is enabled, we'll always be on the system
 		// stack at this point. There will be a runtime.unlockWithRank.func1
@@ -913,6 +970,7 @@ func (prof *mLockProfile) captureStack() {
 		// "runtime.unlock".
 		skip += 1 // runtime.unlockWithRank.func1
 	}
+	prof.pending = 0
 
 	prof.stack[0] = logicalStackSentinel
 	if debug.runtimeContentionStacks.Load() == 0 {
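The restored doc comment above describes how an M with room for only one call stack chooses between two contention events: a random choice weighted by their relative magnitudes, with the loser's cycles going to the "additional contention" (cyclesLost) bucket. A stand-alone, hypothetical sketch of that idea follows; it is not the runtime's implementation, and the helper name is made up:

	package main

	import (
		"fmt"
		"math/rand"
	)

	// keepOne keeps either the buffered magnitude (prev) or the new one (cur),
	// choosing with probability proportional to each magnitude, and adds the
	// loser to *lost. A stand-in for the weighting described in the comment.
	func keepOne(prev, cur int64, lost *int64) int64 {
		if prev <= 0 {
			return cur
		}
		if rand.Int63n(prev+cur) < cur { // keep cur with probability cur/(prev+cur)
			*lost += prev
			return cur
		}
		*lost += cur
		return prev
	}

	func main() {
		var kept, lost int64
		for _, cycles := range []int64{100, 900, 50} { // successive contention magnitudes
			kept = keepOne(kept, cycles, &lost)
		}
		fmt.Println("kept:", kept, "lost:", lost)
	}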
