@@ -781,34 +781,99 @@ func removeMutexWaitList(head muintptr, mp *m) muintptr {
	return head
}

- // claimMutexWaitTime advances the acquireTimes of the list of waiting Ms at
- // head to now, returning an estimate of the total wait time claimed by that
- // action.
- func claimMutexWaitTime(now timePair, head muintptr) timePair {
- 	fixMutexWaitList(head)
- 	hp := head.ptr()
- 	if hp == nil {
- 		return timePair{}
+ // lockTimer assists with profiling contention on runtime-internal locks.
+ //
+ // There are several steps between the time that an M experiences contention and
+ // when that contention may be added to the profile. This comes from our
+ // constraints: We need to keep the critical section of each lock small,
+ // especially when those locks are contended. The reporting code cannot acquire
+ // new locks until the M has released all other locks, which means no memory
+ // allocations and encourages use of (temporary) M-local storage.
+ //
+ // The M will have space for storing one call stack that caused contention, and
+ // for the magnitude of that contention. It will also have space to store the
+ // magnitude of additional contention the M caused, since it only has space to
+ // remember one call stack and might encounter several contention events before
+ // it releases all of its locks and is thus able to transfer the local buffer
+ // into the profile.
+ //
+ // The M will collect the call stack when it unlocks the contended lock. That
+ // minimizes the impact on the critical section of the contended lock, and
+ // matches the mutex profile's behavior for contention in sync.Mutex: measured
+ // at the Unlock method.
+ //
+ // The profile for contention on sync.Mutex blames the caller of Unlock for the
+ // amount of contention experienced by the callers of Lock which had to wait.
+ // When there are several critical sections, this allows identifying which of
+ // them is responsible.
+ //
+ // Matching that behavior for runtime-internal locks will require identifying
+ // which Ms are blocked on the mutex. The semaphore-based implementation is
+ // ready to allow that, but the futex-based implementation will require a bit
+ // more work. Until then, we report contention on runtime-internal locks with a
+ // call stack taken from the unlock call (like the rest of the user-space
+ // "mutex" profile), but assign it a duration value based on how long the
+ // previous lock call took (like the user-space "block" profile).
+ //
+ // Thus, reporting the call stacks of runtime-internal lock contention is
+ // guarded by GODEBUG for now. Set GODEBUG=runtimecontentionstacks=1 to enable.
+ //
+ // TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
+ //
+ // The M will track this by storing a pointer to the lock; lock/unlock pairs for
+ // runtime-internal locks are always on the same M.
+ //
+ // Together, that demands several steps for recording contention. First, when
+ // finally acquiring a contended lock, the M decides whether it should plan to
+ // profile that event by storing a pointer to the lock in its "to be profiled
+ // upon unlock" field. If that field is already set, it uses the relative
+ // magnitudes to weight a random choice between itself and the other lock, with
+ // the loser's time being added to the "additional contention" field. Otherwise
+ // if the M's call stack buffer is occupied, it does the comparison against that
+ // sample's magnitude.
+ //
+ // Second, having unlocked a mutex the M checks to see if it should capture the
+ // call stack into its local buffer. Finally, when the M unlocks its last mutex,
+ // it transfers the local buffer into the profile. As part of that step, it also
+ // transfers any "additional contention" time to the profile. Any lock
+ // contention that it experiences while adding samples to the profile will be
+ // recorded later as "additional contention" and not include a call stack, to
+ // avoid an echo.
+ type lockTimer struct {
+ 	lock      *mutex
+ 	timeRate  int64
+ 	timeStart int64
+ 	tickStart int64
+ }
+
+ func (lt *lockTimer) begin() {
+ 	rate := int64(atomic.Load64(&mutexprofilerate))
+
+ 	lt.timeRate = gTrackingPeriod
+ 	if rate != 0 && rate < lt.timeRate {
+ 		lt.timeRate = rate
+ 	}
+ 	if int64(cheaprand())%lt.timeRate == 0 {
+ 		lt.timeStart = nanotime()
	}
- 	tp := hp.mWaitList.tail.ptr()
- 	waiters := hp.mWaitList.waiters
- 	headTimes := hp.mWaitList.acquireTimes
- 	tailTimes := tp.mWaitList.acquireTimes

- 	var dt timePair
- 	dt.nanotime = now.nanotime - headTimes.nanotime
- 	dt.cputicks = now.cputicks - headTimes.cputicks
- 	if waiters > 1 {
- 		dt.nanotime = int64(waiters) * (dt.nanotime + now.nanotime - tailTimes.nanotime) / 2
- 		dt.cputicks = int64(waiters) * (dt.cputicks + now.cputicks - tailTimes.cputicks) / 2
+ 	if rate > 0 && int64(cheaprand())%rate == 0 {
+ 		lt.tickStart = cputicks()
	}
+ }

- 	// When removeMutexWaitList removes a head or tail node, it's responsible
- 	// for applying these changes to the new head or tail.
- 	hp.mWaitList.acquireTimes = now
- 	tp.mWaitList.acquireTimes = now
+ func (lt *lockTimer) end() {
+ 	gp := getg()

- 	return dt
+ 	if lt.timeStart != 0 {
+ 		nowTime := nanotime()
+ 		gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
+ 	}
+
+ 	if lt.tickStart != 0 {
+ 		nowTick := cputicks()
+ 		gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
+ 	}
}

// mLockProfile is part of the M struct to hold information relating to mutex
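The doc comment above describes choosing between two contention events by weighting a random choice with their relative magnitudes, so the loser's time can be folded into the "additional contention" bucket. A minimal, self-contained sketch of that idea (using math/rand/v2 in place of the runtime's internal cheaprand; keepExisting is an illustrative name, not part of this change): the buffered sample survives with probability prev/(prev+next), which preserves each call stack's expected contribution even though only one stack can be stored at a time.

```go
package main

import (
	"fmt"
	"math/rand/v2"
)

// keepExisting reports whether the already-buffered contention sample of
// magnitude prev should survive when a new event of magnitude next arrives.
// The winner is chosen in proportion to its share of the combined contention.
func keepExisting(prev, next int64) bool {
	return rand.Int64N(prev+next) < prev
}

func main() {
	prev, next := int64(900), int64(100)
	kept := 0
	for i := 0; i < 100_000; i++ {
		if keepExisting(prev, next) {
			kept++
		}
	}
	// Expect the existing sample to win about 90% of the time.
	fmt.Printf("kept existing sample in %.1f%% of trials\n", float64(kept)/1000)
}
```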
@@ -825,49 +890,18 @@ func claimMutexWaitTime(now timePair, head muintptr) timePair {
// not include a call stack, to avoid an echo.
type mLockProfile struct {
	waitTime   atomic.Int64 // total nanoseconds spent waiting in runtime.lockWithRank
- 	stack      []uintptr    // unlock stack that caused delay in other Ms' runtime.lockWithRank
- 	cycles     int64        // cycles attributable to "stack"
+ 	stack      []uintptr    // stack that experienced contention in runtime.lockWithRank
+ 	pending    uintptr      // *mutex that experienced contention (to be traceback-ed)
+ 	cycles     int64        // cycles attributable to "pending" (if set), otherwise to "stack"
	cyclesLost int64        // contention for which we weren't able to record a call stack
	disabled   bool         // attribute all time to "lost"
}

- // recordUnlock considers the current unlock call (which caused a total of dt
- // delay in other Ms) for later inclusion in the mutex contention profile. If
- // this M holds no other locks, it transfers the buffered contention record to
- // the mutex contention profile.
- //
- // From unlock2, we might not be holding a p in this code.
- //
- //go:nowritebarrierrec
- func (prof *mLockProfile) recordUnlock(dt timePair) {
- 	if dt != (timePair{}) {
- 		// We could make a point of clearing out the local storage right before
- 		// this, to have a slightly better chance of being able to see the call
- 		// stack if the program has several (nested) contended locks. If apps
- 		// are seeing a lot of _LostContendedRuntimeLock samples, maybe that'll
- 		// be a worthwhile change.
- 		prof.proposeUnlock(dt)
- 	}
- 	if getg().m.locks == 1 && prof.cycles != 0 {
- 		prof.store()
- 	}
- }
-
- func (prof *mLockProfile) proposeUnlock(dt timePair) {
- 	if nanos := dt.nanotime; nanos > 0 {
- 		prof.waitTime.Add(nanos)
- 	}
-
- 	cycles := dt.cputicks
+ func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
	if cycles <= 0 {
		return
	}

- 	rate := int64(atomic.Load64(&mutexprofilerate))
- 	if rate <= 0 || int64(cheaprand())%rate != 0 {
- 		return
- 	}
-
	if prof.disabled {
		// We're experiencing contention while attempting to report contention.
		// Make a note of its magnitude, but don't allow it to be the sole cause
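With this change, the per-event profiling-rate check moves out of the unlock path: lockTimer.begin starts a nanotime measurement on roughly one in timeRate acquisitions, and lockTimer.end scales the measured delay back up by timeRate before adding it to waitTime, so the expected total matches the unsampled sum. A rough, self-contained simulation of that bookkeeping (the names, the fixed 100ns delay, and the rate of 8 are made up for illustration):

```go
package main

import (
	"fmt"
	"math/rand/v2"
)

func main() {
	const (
		timeRate = 8         // sample roughly one in eight acquisitions
		delay    = 100       // pretend every contended acquisition waits 100ns
		events   = 1_000_000 // number of simulated lock acquisitions
	)
	var trueTotal, sampledTotal int64
	for i := 0; i < events; i++ {
		trueTotal += delay
		// Mirrors lockTimer.begin: take a timestamp on ~1/timeRate of events.
		if rand.Int64N(timeRate) == 0 {
			// Mirrors lockTimer.end: scale the observed delay by the rate.
			sampledTotal += delay * timeRate
		}
	}
	fmt.Printf("true total:    %d ns\n", trueTotal)
	fmt.Printf("sampled total: %d ns (equal in expectation)\n", sampledTotal)
}
```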
@@ -876,6 +910,13 @@ func (prof *mLockProfile) proposeUnlock(dt timePair) {
		return
	}

+ 	if uintptr(unsafe.Pointer(l)) == prof.pending {
+ 		// Optimization: we'd already planned to profile this same lock (though
+ 		// possibly from a different unlock site).
+ 		prof.cycles += cycles
+ 		return
+ 	}
+
	if prev := prof.cycles; prev > 0 {
		// We can only store one call stack for runtime-internal lock contention
		// on this M, and we've already got one. Decide which should stay, and
@@ -889,8 +930,24 @@ func (prof *mLockProfile) proposeUnlock(dt timePair) {
			prof.cyclesLost += prev
		}
	}
+ 	// Saving the *mutex as a uintptr is safe because:
+ 	//  - lockrank_on.go does this too, which gives it regular exercise
+ 	//  - the lock would only move if it's stack allocated, which means it
+ 	//    cannot experience multi-M contention
+ 	prof.pending = uintptr(unsafe.Pointer(l))
	prof.cycles = cycles
- 	prof.captureStack()
+ }
+
+ // From unlock2, we might not be holding a p in this code.
+ //
+ //go:nowritebarrierrec
+ func (prof *mLockProfile) recordUnlock(l *mutex) {
+ 	if uintptr(unsafe.Pointer(l)) == prof.pending {
+ 		prof.captureStack()
+ 	}
+ 	if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
+ 		prof.store()
+ 	}
}

func (prof *mLockProfile) captureStack() {
@@ -900,7 +957,7 @@ func (prof *mLockProfile) captureStack() {
		return
	}

- 	skip := 4 // runtime.(*mLockProfile).proposeUnlock runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
+ 	skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
	if staticLockRanking {
		// When static lock ranking is enabled, we'll always be on the system
		// stack at this point. There will be a runtime.unlockWithRank.func1
@@ -913,6 +970,7 @@ func (prof *mLockProfile) captureStack() {
		// "runtime.unlock".
		skip += 1 // runtime.unlockWithRank.func1
	}
+ 	prof.pending = 0

	prof.stack[0] = logicalStackSentinel
	if debug.runtimeContentionStacks.Load() == 0 {
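For context on how this surfaces to users: runtime-internal lock contention feeds the same "mutex" profile as sync.Mutex, and (per the comment above) its call stacks are gated behind GODEBUG=runtimecontentionstacks=1 for now. A small usage sketch, assuming a toolchain that includes this change; the workload here is ordinary sync.Mutex contention and only the standard runtime and runtime/pprof APIs are used:

```go
// Run with: GODEBUG=runtimecontentionstacks=1 go run main.go
package main

import (
	"os"
	"runtime"
	"runtime/pprof"
	"sync"
)

func main() {
	runtime.SetMutexProfileFraction(1) // sample every contention event

	var mu sync.Mutex
	var wg sync.WaitGroup
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				mu.Lock()
				mu.Unlock()
			}
		}()
	}
	wg.Wait()

	// Dump the "mutex" profile; inspect the output with `go tool pprof`.
	pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
}
```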