
Commit 4d0f955

rhysh authored and gopherbot committed
runtime: blame unlocker for mutex delay
Correct how the mutex contention profile reports on runtime-internal mutex
values, to match sync.Mutex's semantics.

Decide at the start of unlock2 whether we'd like to collect a contention
sample. If so:

 - Opt in to a slightly slower unlock path which avoids accidentally
   accepting blame for delay caused by other Ms.
 - Release the lock before doing an O(N) traversal of the stack of waiting
   Ms, to calculate the total delay to those Ms that our critical section
   caused.
 - Report that, with the current callstack, in the mutex profile.

Fixes #66999

Change-Id: I561ed8dc120669bd045d514cb0d1c6c99c2add04
Reviewed-on: https://go-review.googlesource.com/c/go/+/667615
LUCI-TryBot-Result: Go LUCI <[email protected]>
Auto-Submit: Rhys Hiltner <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
Reviewed-by: Michael Pratt <[email protected]>
1 parent bce8486 commit 4d0f955
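
For context, the contention recorded by this change is reported through the same mutex profile that covers sync.Mutex, gated by the same runtime.SetMutexProfileFraction knob. Below is a minimal sketch of surfacing that profile; the workload is a placeholder, and only standard runtime and runtime/pprof APIs are used.

package main

import (
	"os"
	"runtime"
	"runtime/pprof"
	"sync"
)

func main() {
	// Report roughly 1 in 5 mutex contention events; 0 restores the default
	// of no mutex profiling.
	runtime.SetMutexProfileFraction(5)
	defer runtime.SetMutexProfileFraction(0)

	// Placeholder workload: generate some contention on a sync.Mutex.
	var mu sync.Mutex
	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				mu.Lock()
				mu.Unlock()
			}
		}()
	}
	wg.Wait()

	// Write the accumulated mutex contention profile in its text form.
	pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
}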

File tree

2 files changed: +165 -129 lines


src/runtime/lock_spinbit.go

Lines changed: 107 additions & 12 deletions
@@ -86,7 +86,8 @@ func key8(p *uintptr) *uint8 {
 // forming a singly-linked list with the mutex's key field pointing to the head
 // of the list.
 type mWaitList struct {
-	next muintptr // next m waiting for lock
+	next       muintptr // next m waiting for lock
+	startTicks int64    // when this m started waiting for the current lock holder, in cputicks
 }

 // lockVerifyMSize confirms that we can recreate the low bits of the M pointer.
@@ -170,16 +171,15 @@ func lock2(l *mutex) {
 	}
 	semacreate(gp.m)

-	timer := &lockTimer{lock: l}
-	timer.begin()
+	var startTime int64
 	// On uniprocessors, no point spinning.
 	// On multiprocessors, spin for mutexActiveSpinCount attempts.
 	spin := 0
 	if ncpu > 1 {
 		spin = mutexActiveSpinCount
 	}

-	var weSpin, atTail bool
+	var weSpin, atTail, haveTimers bool
 	v := atomic.Loaduintptr(&l.key)
 tryAcquire:
 	for i := 0; ; i++ {
@@ -192,13 +192,13 @@ tryAcquire:
 					next = next &^ mutexSleeping
 				}
 				if atomic.Casuintptr(&l.key, v, next) {
-					timer.end()
+					gp.m.mLockProfile.end(startTime)
 					return
 				}
 			} else {
 				prev8 := atomic.Xchg8(k8, mutexLocked|mutexSleeping)
 				if prev8&mutexLocked == 0 {
-					timer.end()
+					gp.m.mLockProfile.end(startTime)
 					return
 				}
 			}
@@ -228,6 +228,13 @@ tryAcquire:
 			throw("runtime·lock: sleeping while lock is available")
 		}

+		// Collect times for mutex profile (seen in unlock2 only via mWaitList),
+		// and for "/sync/mutex/wait/total:seconds" metric (to match).
+		if !haveTimers {
+			gp.m.mWaitList.startTicks = cputicks()
+			startTime = gp.m.mLockProfile.start()
+			haveTimers = true
+		}
 		// Store the current head of the list of sleeping Ms in our gp.m.mWaitList.next field
 		gp.m.mWaitList.next = mutexWaitListHead(v)

@@ -260,16 +267,54 @@ func unlock(l *mutex) {
 func unlock2(l *mutex) {
 	gp := getg()

-	prev8 := atomic.Xchg8(key8(&l.key), 0)
+	var prev8 uint8
+	var haveStackLock bool
+	var endTicks int64
+	if !mutexSampleContention() {
+		// Not collecting a sample for the contention profile, do the quick release
+		prev8 = atomic.Xchg8(key8(&l.key), 0)
+	} else {
+		// If there's contention, we'll sample it. Don't allow another
+		// lock2/unlock2 pair to finish before us and take our blame. Prevent
+		// that by trading for the stack lock with a CAS.
+		v := atomic.Loaduintptr(&l.key)
+		for {
+			if v&^mutexMMask == 0 || v&mutexStackLocked != 0 {
+				// No contention, or (stack lock unavailable) no way to calculate it
+				prev8 = atomic.Xchg8(key8(&l.key), 0)
+				endTicks = 0
+				break
+			}
+
+			// There's contention, the stack lock appeared to be available, and
+			// we'd like to collect a sample for the contention profile.
+			if endTicks == 0 {
+				// Read the time before releasing the lock. The profile will be
+				// strictly smaller than what other threads would see by timing
+				// their lock calls.
+				endTicks = cputicks()
+			}
+			next := (v | mutexStackLocked) &^ (mutexLocked | mutexSleeping)
+			if atomic.Casuintptr(&l.key, v, next) {
+				haveStackLock = true
+				prev8 = uint8(v)
+				// The fast path of lock2 may have cleared mutexSleeping.
+				// Restore it so we're sure to call unlock2Wake below.
+				prev8 |= mutexSleeping
+				break
+			}
+			v = atomic.Loaduintptr(&l.key)
+		}
+	}
 	if prev8&mutexLocked == 0 {
 		throw("unlock of unlocked lock")
 	}

 	if prev8&mutexSleeping != 0 {
-		unlock2Wake(l)
+		unlock2Wake(l, haveStackLock, endTicks)
 	}

-	gp.m.mLockProfile.recordUnlock(l)
+	gp.m.mLockProfile.store()
 	gp.m.locks--
 	if gp.m.locks < 0 {
 		throw("runtime·unlock: lock count")
@@ -279,15 +324,35 @@ func unlock2(l *mutex) {
 	}
 }

+// mutexSampleContention returns whether the current mutex operation should
+// report any contention it discovers.
+func mutexSampleContention() bool {
+	if rate := int64(atomic.Load64(&mutexprofilerate)); rate <= 0 {
+		return false
+	} else {
+		// TODO: have SetMutexProfileFraction do the clamping
+		rate32 := uint32(rate)
+		if int64(rate32) != rate {
+			rate32 = ^uint32(0)
+		}
+		return cheaprandn(rate32) == 0
+	}
+}
+
 // unlock2Wake updates the list of Ms waiting on l, waking an M if necessary.
 //
 //go:nowritebarrier
-func unlock2Wake(l *mutex) {
+func unlock2Wake(l *mutex, haveStackLock bool, endTicks int64) {
 	v := atomic.Loaduintptr(&l.key)

 	// On occasion, seek out and wake the M at the bottom of the stack so it
 	// doesn't starve.
 	antiStarve := cheaprandn(mutexTailWakePeriod) == 0
+
+	if haveStackLock {
+		goto useStackLock
+	}
+
 	if !(antiStarve || // avoiding starvation may require a wake
 		v&mutexSpinning == 0 || // no spinners means we must wake
 		mutexPreferLowLatency(l)) { // prefer waiters be awake as much as possible
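The mutexSampleContention helper added above makes a plain 1-in-rate decision per unlock, so with a profile fraction of rate, roughly 1/rate of contended unlocks are sampled. A rough stand-alone illustration of that decision, using math/rand/v2 in place of the runtime-internal cheaprandn (the names here are illustrative, not part of the change):

package main

import (
	"fmt"
	"math/rand/v2"
)

// sample reports whether one contention event should be recorded when the
// profile rate means "report about 1 in rate events"; rate <= 0 disables it.
func sample(rate int64) bool {
	if rate <= 0 {
		return false
	}
	return rand.Int64N(rate) == 0
}

func main() {
	const rate, trials = 100, 1_000_000
	hits := 0
	for i := 0; i < trials; i++ {
		if sample(rate) {
			hits++
		}
	}
	fmt.Printf("sampled %d of %d events (expect about %d)\n", hits, trials, trials/rate)
}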
@@ -324,6 +389,30 @@ func unlock2Wake(l *mutex) {
 	// We own the mutexStackLocked flag. New Ms may push themselves onto the
 	// stack concurrently, but we're now the only thread that can remove or
 	// modify the Ms that are sleeping in the list.
+useStackLock:
+
+	if endTicks != 0 {
+		// Find the M at the bottom of the stack of waiters, which has been
+		// asleep for the longest. Take the average of its wait time and the
+		// head M's wait time for the mutex contention profile, matching the
+		// estimate we do in semrelease1 (for sync.Mutex contention).
+		//
+		// We don't keep track of the tail node (we don't need it often), so do
+		// an O(N) walk on the list of sleeping Ms to find it.
+		head := mutexWaitListHead(v).ptr()
+		for node, n := head, 0; ; {
+			n++
+			next := node.mWaitList.next.ptr()
+			if next == nil {
+				cycles := endTicks - (head.mWaitList.startTicks+node.mWaitList.startTicks)/2
+				node.mWaitList.startTicks = endTicks
+				head.mWaitList.startTicks = endTicks
+				getg().m.mLockProfile.recordUnlock(cycles * int64(n))
+				break
+			}
+			node = next
+		}
+	}

 	var committed *m // If we choose an M within the stack, we've made a promise to wake it
 	for {
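The blame calculation in the hunk above charges the sampled unlocker for all n sleeping Ms, approximating each one's delay as the average of the oldest and newest waiters' wait times. A small stand-alone sketch of that arithmetic with made-up tick values (estimateBlame is an illustrative name, not a runtime function):

package main

import "fmt"

// estimateBlame mirrors the arithmetic in the hunk above: charge the unlocker
// for n waiters, each assumed to have waited roughly the average of the
// oldest and newest waiters' delays. All values are in CPU ticks.
func estimateBlame(endTicks, oldestStart, newestStart, n int64) int64 {
	perWaiter := endTicks - (oldestStart+newestStart)/2
	return perWaiter * n
}

func main() {
	// Made-up numbers: the oldest waiter went to sleep at tick 1_000, the
	// newest at tick 7_000, and the lock is released at tick 10_000 with
	// 3 Ms asleep: (10_000 - 4_000) * 3 = 18_000 ticks of blame.
	fmt.Println(estimateBlame(10_000, 1_000, 7_000, 3))
}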
@@ -349,8 +438,14 @@ func unlock2Wake(l *mutex) {
 				prev, wakem = wakem, next
 			}
 			if wakem != mp {
-				prev.mWaitList.next = wakem.mWaitList.next
 				committed = wakem
+				prev.mWaitList.next = wakem.mWaitList.next
+				// An M sets its own startTicks when it first goes to sleep.
+				// When an unlock operation is sampled for the mutex
+				// contention profile, it takes blame for the entire list of
+				// waiting Ms but only updates the startTicks value at the
+				// tail. Copy any updates to the next-oldest M.
+				prev.mWaitList.startTicks = wakem.mWaitList.startTicks
 			}
 		}
 	}
@@ -365,7 +460,7 @@ func unlock2Wake(l *mutex) {
 			// Claimed an M. Wake it.
 			semawakeup(wakem)
 		}
-		break
+		return
 	}

 	v = atomic.Loaduintptr(&l.key)
