Skip to content

Commit 1be701a

Browse files
rhyshgopherbot
authored and committed
Revert "runtime: split mutex profile clocks"
This reverts commit 8ab131f (CL 586796). Reason for revert: This is part of a patch series that changed the handling of contended lock2/unlock2 calls, reducing the maximum throughput of contended runtime.mutex values, and causing a performance regression on applications where that is (or became) the bottleneck. Updates #66999. Updates #67585. Change-Id: I54711691e86e072081482102019d168292b5150a. Reviewed-on: https://go-review.googlesource.com/c/go/+/589095. Reviewed-by: Michael Pratt <[email protected]>. Auto-Submit: Rhys Hiltner <[email protected]>. LUCI-TryBot-Result: Go LUCI <[email protected]>. Reviewed-by: Than McIntosh <[email protected]>.
1 parent e6504ce commit 1be701a

File tree

3 files changed

+49
-66
lines changed

3 files changed

+49
-66
lines changed

src/runtime/lock_futex.go

Lines changed: 7 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -80,14 +80,7 @@ func lock2(l *mutex) {
8080
gp.stackguard0, gp.throwsplit = stackPreempt, true
8181
}
8282

83-
var startNanos int64
84-
const sampleRate = gTrackingPeriod
85-
sample := cheaprandn(sampleRate) == 0
86-
if sample {
87-
startNanos = nanotime()
88-
}
89-
gp.m.mWaitList.acquireTicks = cputicks()
90-
83+
gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
9184
// On uniprocessors, no point spinning.
9285
// On multiprocessors, spin for ACTIVE_SPIN attempts.
9386
spin := 0
@@ -119,19 +112,14 @@ Loop:
119112

120113
if v == old || atomic.Casuintptr(&l.key, old, v) {
121114
gp.m.mWaitList.clearLinks()
122-
gp.m.mWaitList.acquireTicks = 0
115+
gp.m.mWaitList.acquireTimes = timePair{}
123116
break
124117
}
125118
v = atomic.Loaduintptr(&l.key)
126119
}
127120
if gp == gp.m.curg {
128121
gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
129122
}
130-
131-
if sample {
132-
endNanos := nanotime()
133-
gp.m.mLockProfile.waitTime.Add((endNanos - startNanos) * sampleRate)
134-
}
135123
return
136124
}
137125
i = 0
@@ -173,8 +161,7 @@ func unlock(l *mutex) {
173161
}
174162

175163
func unlock2(l *mutex) {
176-
var claimed bool
177-
var cycles int64
164+
now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
178165
for {
179166
v := atomic.Loaduintptr(&l.key)
180167
if v == mutex_locked {
@@ -184,11 +171,10 @@ func unlock2(l *mutex) {
184171
} else if v&mutex_locked == 0 {
185172
throw("unlock of unlocked lock")
186173
} else {
187-
if !claimed {
188-
claimed = true
189-
nowTicks := cputicks()
174+
if now != (timePair{}) {
190175
head := muintptr(v &^ (mutex_sleeping | mutex_locked))
191-
cycles = claimMutexWaitTime(nowTicks, head)
176+
dt = claimMutexWaitTime(now, head)
177+
now = timePair{}
192178
}
193179

194180
// Other M's are waiting for the lock.
@@ -200,7 +186,7 @@ func unlock2(l *mutex) {
200186
}
201187

202188
gp := getg()
203-
gp.m.mLockProfile.recordUnlock(cycles)
189+
gp.m.mLockProfile.recordUnlock(dt)
204190
gp.m.locks--
205191
if gp.m.locks < 0 {
206192
throw("runtime·unlock: lock count")

src/runtime/lock_sema.go

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -60,14 +60,7 @@ func lock2(l *mutex) {
6060
gp.stackguard0, gp.throwsplit = stackPreempt, true
6161
}
6262

63-
var startNanos int64
64-
const sampleRate = gTrackingPeriod
65-
sample := cheaprandn(sampleRate) == 0
66-
if sample {
67-
startNanos = nanotime()
68-
}
69-
gp.m.mWaitList.acquireTicks = cputicks()
70-
63+
gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
7164
// On uniprocessor's, no point spinning.
7265
// On multiprocessors, spin for ACTIVE_SPIN attempts.
7366
spin := 0
@@ -95,19 +88,14 @@ Loop:
9588

9689
if v == old || atomic.Casuintptr(&l.key, old, v) {
9790
gp.m.mWaitList.clearLinks()
98-
gp.m.mWaitList.acquireTicks = 0
91+
gp.m.mWaitList.acquireTimes = timePair{}
9992
break
10093
}
10194
v = atomic.Loaduintptr(&l.key)
10295
}
10396
if gp == gp.m.curg {
10497
gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
10598
}
106-
107-
if sample {
108-
endNanos := nanotime()
109-
gp.m.mLockProfile.waitTime.Add((endNanos - startNanos) * sampleRate)
110-
}
11199
return
112100
}
113101
i = 0
@@ -157,8 +145,7 @@ func unlock(l *mutex) {
157145
//
158146
//go:nowritebarrier
159147
func unlock2(l *mutex) {
160-
var claimed bool
161-
var cycles int64
148+
now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
162149
gp := getg()
163150
var mp *m
164151
for {
@@ -168,11 +155,9 @@ func unlock2(l *mutex) {
168155
break
169156
}
170157
} else {
171-
if !claimed {
172-
claimed = true
173-
nowTicks := cputicks()
174-
head := muintptr(v &^ locked)
175-
cycles = claimMutexWaitTime(nowTicks, head)
158+
if now != (timePair{}) {
159+
dt = claimMutexWaitTime(now, muintptr(v&^locked))
160+
now = timePair{}
176161
}
177162

178163
// Other M's are waiting for the lock.
@@ -186,7 +171,7 @@ func unlock2(l *mutex) {
186171
}
187172
}
188173

189-
gp.m.mLockProfile.recordUnlock(cycles)
174+
gp.m.mLockProfile.recordUnlock(dt)
190175
gp.m.locks--
191176
if gp.m.locks < 0 {
192177
throw("runtime·unlock: lock count")

src/runtime/mprof.go

Lines changed: 35 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,7 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
590590
// mutex.
591591
//
592592
// Having found the head and tail nodes and a correct waiters count, the
593-
// unlocking M can read and update those two nodes' acquireTicks field and thus
593+
// unlocking M can read and update those two nodes' acquireTimes fields and thus
594594
// take responsibility for (an estimate of) the entire list's delay since the
595595
// last unlock call.
596596
//
@@ -603,16 +603,21 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
603603
// runtime controls the order of thread wakeups (it's a LIFO stack), but with
604604
// lock_futex.go the OS can wake an arbitrary thread.
605605
type mWaitList struct {
606-
acquireTicks int64 // start of current wait (set by us, updated by others during unlock)
606+
acquireTimes timePair // start of current wait (set by us, updated by others during unlock)
607607
next muintptr // next m waiting for lock (set by us, cleared by another during unlock)
608608
prev muintptr // previous m waiting for lock (an amortized hint, set by another during unlock)
609609
tail muintptr // final m waiting for lock (an amortized hint, set by others during unlock)
610610
waiters int32 // length of waiting m list (an amortized hint, set by another during unlock)
611611
}
612612

613+
type timePair struct {
614+
nanotime int64
615+
cputicks int64
616+
}
617+
613618
// clearLinks resets the fields related to the M's position in the list of Ms
614-
// waiting for a mutex. It leaves acquireTicks intact, since this M may still be
615-
// waiting and may have had its acquireTicks updated by an unlock2 call.
619+
// waiting for a mutex. It leaves acquireTimes intact, since this M may still be
620+
// waiting and may have had its acquireTimes updated by an unlock2 call.
616621
//
617622
// In lock_sema.go, the previous owner of the mutex dequeues an M and then wakes
618623
// it; with semaphore-based sleep, it's important that each M receives only one
@@ -731,8 +736,8 @@ func removeMutexWaitList(head muintptr, mp *m) muintptr {
731736
hp := head.ptr()
732737
tail := hp.mWaitList.tail
733738
waiters := hp.mWaitList.waiters
734-
headTicks := hp.mWaitList.acquireTicks
735-
tailTicks := hp.mWaitList.tail.ptr().mWaitList.acquireTicks
739+
headTimes := hp.mWaitList.acquireTimes
740+
tailTimes := hp.mWaitList.tail.ptr().mWaitList.acquireTimes
736741

737742
mp.mWaitList.tail = 0
738743

@@ -768,40 +773,42 @@ func removeMutexWaitList(head muintptr, mp *m) muintptr {
768773
hp.mWaitList.prev = 0
769774
hp.mWaitList.tail = tail
770775
hp.mWaitList.waiters = waiters - 1
771-
hp.mWaitList.acquireTicks = headTicks
776+
hp.mWaitList.acquireTimes = headTimes
772777
}
773778
if tp := tail.ptr(); tp != nil {
774-
tp.mWaitList.acquireTicks = tailTicks
779+
tp.mWaitList.acquireTimes = tailTimes
775780
}
776781
return head
777782
}
778783

779-
// claimMutexWaitTime advances the acquireTicks of the list of waiting Ms at
784+
// claimMutexWaitTime advances the acquireTimes of the list of waiting Ms at
780785
// head to now, returning an estimate of the total wait time claimed by that
781786
// action.
782-
func claimMutexWaitTime(nowTicks int64, head muintptr) int64 {
787+
func claimMutexWaitTime(now timePair, head muintptr) timePair {
783788
fixMutexWaitList(head)
784789
hp := head.ptr()
785790
if hp == nil {
786-
return 0
791+
return timePair{}
787792
}
788793
tp := hp.mWaitList.tail.ptr()
789794
waiters := hp.mWaitList.waiters
790-
headTicks := hp.mWaitList.acquireTicks
791-
tailTicks := tp.mWaitList.acquireTicks
795+
headTimes := hp.mWaitList.acquireTimes
796+
tailTimes := tp.mWaitList.acquireTimes
792797

793-
var cycles int64
794-
cycles = nowTicks - headTicks
798+
var dt timePair
799+
dt.nanotime = now.nanotime - headTimes.nanotime
800+
dt.cputicks = now.cputicks - headTimes.cputicks
795801
if waiters > 1 {
796-
cycles = int64(waiters) * (cycles + nowTicks - tailTicks) / 2
802+
dt.nanotime = int64(waiters) * (dt.nanotime + now.nanotime - tailTimes.nanotime) / 2
803+
dt.cputicks = int64(waiters) * (dt.cputicks + now.cputicks - tailTimes.cputicks) / 2
797804
}
798805

799806
// When removeMutexWaitList removes a head or tail node, it's responsible
800807
// for applying these changes to the new head or tail.
801-
hp.mWaitList.acquireTicks = nowTicks
802-
tp.mWaitList.acquireTicks = nowTicks
808+
hp.mWaitList.acquireTimes = now
809+
tp.mWaitList.acquireTimes = now
803810

804-
return cycles
811+
return dt
805812
}
806813

807814
// mLockProfile is part of the M struct to hold information relating to mutex
@@ -832,21 +839,26 @@ type mLockProfile struct {
832839
// From unlock2, we might not be holding a p in this code.
833840
//
834841
//go:nowritebarrierrec
835-
func (prof *mLockProfile) recordUnlock(cycles int64) {
836-
if cycles != 0 {
842+
func (prof *mLockProfile) recordUnlock(dt timePair) {
843+
if dt != (timePair{}) {
837844
// We could make a point of clearing out the local storage right before
838845
// this, to have a slightly better chance of being able to see the call
839846
// stack if the program has several (nested) contended locks. If apps
840847
// are seeing a lot of _LostContendedRuntimeLock samples, maybe that'll
841848
// be a worthwhile change.
842-
prof.proposeUnlock(cycles)
849+
prof.proposeUnlock(dt)
843850
}
844851
if getg().m.locks == 1 && prof.cycles != 0 {
845852
prof.store()
846853
}
847854
}
848855

849-
func (prof *mLockProfile) proposeUnlock(cycles int64) {
856+
func (prof *mLockProfile) proposeUnlock(dt timePair) {
857+
if nanos := dt.nanotime; nanos > 0 {
858+
prof.waitTime.Add(nanos)
859+
}
860+
861+
cycles := dt.cputicks
850862
if cycles <= 0 {
851863
return
852864
}

0 commit comments

Comments
 (0)