Skip to content

Commit ff059ad

Browse files
lucoffeprattmic
authored andcommitted
runtime: resolve checkdead panic by refining startm lock handling in caller context
This change addresses a `checkdead` panic caused by a race condition between `sysmon->startm` and `checkdead` callers, due to prematurely releasing the scheduler lock. The solution involves allowing a `startm` caller to acquire the scheduler lock and call `startm` in this context. A new `lockheld` bool argument is added to `startm`, which manages all lock and unlock calls within the function. The`startIdle` function variable in `injectglist` is updated to call `startm` with the lock held, ensuring proper lock handling in this specific case. This refined lock handling resolves the observed race condition issue. Fixes #59600 Change-Id: I11663a15536c10c773fc2fde291d959099aa71be Reviewed-on: https://go-review.googlesource.com/c/go/+/487316 Reviewed-by: Dmitri Shuralyov <[email protected]> Reviewed-by: Michael Pratt <[email protected]> TryBot-Result: Gopher Robot <[email protected]> Run-TryBot: Michael Pratt <[email protected]>
1 parent 752ef81 commit ff059ad

File tree

1 file changed

+31
-14
lines changed

1 file changed

+31
-14
lines changed

src/runtime/proc.go

Lines changed: 31 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,10 +2435,15 @@ func mspinning() {
24352435
// Callers passing a non-nil P must call from a non-preemptible context. See
24362436
// comment on acquirem below.
24372437
//
2438+
// Argument lockheld indicates whether the caller already acquired the
2439+
// scheduler lock. Callers holding the lock when making the call must pass
2440+
// true. The lock might be temporarily dropped, but will be reacquired before
2441+
// returning.
2442+
//
24382443
// Must not have write barriers because this may be called without a P.
24392444
//
24402445
//go:nowritebarrierrec
2441-
func startm(pp *p, spinning bool) {
2446+
func startm(pp *p, spinning, lockheld bool) {
24422447
// Disable preemption.
24432448
//
24442449
// Every owned P must have an owner that will eventually stop it in the
@@ -2456,7 +2461,9 @@ func startm(pp *p, spinning bool) {
24562461
// startm. Callers passing a nil P may be preemptible, so we must
24572462
// disable preemption before acquiring a P from pidleget below.
24582463
mp := acquirem()
2459-
lock(&sched.lock)
2464+
if !lockheld {
2465+
lock(&sched.lock)
2466+
}
24602467
if pp == nil {
24612468
if spinning {
24622469
// TODO(prattmic): All remaining calls to this function
@@ -2466,7 +2473,9 @@ func startm(pp *p, spinning bool) {
24662473
}
24672474
pp, _ = pidleget(0)
24682475
if pp == nil {
2469-
unlock(&sched.lock)
2476+
if !lockheld {
2477+
unlock(&sched.lock)
2478+
}
24702479
releasem(mp)
24712480
return
24722481
}
@@ -2480,6 +2489,8 @@ func startm(pp *p, spinning bool) {
24802489
// could find no idle P while checkdead finds a runnable G but
24812490
// no running M's because this new M hasn't started yet, thus
24822491
// throwing in an apparent deadlock.
2492+
// This apparent deadlock is possible when startm is called
2493+
// from sysmon, which doesn't count as a running M.
24832494
//
24842495
// Avoid this situation by pre-allocating the ID for the new M,
24852496
// thus marking it as 'running' before we drop sched.lock. This
@@ -2494,12 +2505,18 @@ func startm(pp *p, spinning bool) {
24942505
fn = mspinning
24952506
}
24962507
newm(fn, pp, id)
2508+
2509+
if lockheld {
2510+
lock(&sched.lock)
2511+
}
24972512
// Ownership transfer of pp committed by start in newm.
24982513
// Preemption is now safe.
24992514
releasem(mp)
25002515
return
25012516
}
2502-
unlock(&sched.lock)
2517+
if !lockheld {
2518+
unlock(&sched.lock)
2519+
}
25032520
if nmp.spinning {
25042521
throw("startm: m is spinning")
25052522
}
@@ -2528,24 +2545,24 @@ func handoffp(pp *p) {
25282545

25292546
// if it has local work, start it straight away
25302547
if !runqempty(pp) || sched.runqsize != 0 {
2531-
startm(pp, false)
2548+
startm(pp, false, false)
25322549
return
25332550
}
25342551
// if there's trace work to do, start it straight away
25352552
if (trace.enabled || trace.shutdown) && traceReaderAvailable() != nil {
2536-
startm(pp, false)
2553+
startm(pp, false, false)
25372554
return
25382555
}
25392556
// if it has GC work, start it straight away
25402557
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) {
2541-
startm(pp, false)
2558+
startm(pp, false, false)
25422559
return
25432560
}
25442561
// no local work, check that there are no spinning/idle M's,
25452562
// otherwise our help is not required
25462563
if sched.nmspinning.Load()+sched.npidle.Load() == 0 && sched.nmspinning.CompareAndSwap(0, 1) { // TODO: fast atomic
25472564
sched.needspinning.Store(0)
2548-
startm(pp, true)
2565+
startm(pp, true, false)
25492566
return
25502567
}
25512568
lock(&sched.lock)
@@ -2567,14 +2584,14 @@ func handoffp(pp *p) {
25672584
}
25682585
if sched.runqsize != 0 {
25692586
unlock(&sched.lock)
2570-
startm(pp, false)
2587+
startm(pp, false, false)
25712588
return
25722589
}
25732590
// If this is the last running P and nobody is polling network,
25742591
// need to wakeup another M to poll network.
25752592
if sched.npidle.Load() == gomaxprocs-1 && sched.lastpoll.Load() != 0 {
25762593
unlock(&sched.lock)
2577-
startm(pp, false)
2594+
startm(pp, false, false)
25782595
return
25792596
}
25802597

@@ -2623,7 +2640,7 @@ func wakep() {
26232640
// see at least one running M (ours).
26242641
unlock(&sched.lock)
26252642

2626-
startm(pp, true)
2643+
startm(pp, true, false)
26272644

26282645
releasem(mp)
26292646
}
@@ -3376,8 +3393,8 @@ func injectglist(glist *gList) {
33763393
break
33773394
}
33783395

3396+
startm(pp, false, true)
33793397
unlock(&sched.lock)
3380-
startm(pp, false)
33813398
releasem(mp)
33823399
}
33833400
}
@@ -5484,7 +5501,7 @@ func sysmon() {
54845501
// See issue 42515 and
54855502
// https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094.
54865503
if next := timeSleepUntil(); next < now {
5487-
startm(nil, false)
5504+
startm(nil, false, false)
54885505
}
54895506
}
54905507
if scavenger.sysmonWake.Load() != 0 {
@@ -5756,7 +5773,7 @@ func schedEnableUser(enable bool) {
57565773
globrunqputbatch(&sched.disable.runnable, n)
57575774
unlock(&sched.lock)
57585775
for ; n != 0 && sched.npidle.Load() != 0; n-- {
5759-
startm(nil, false)
5776+
startm(nil, false, false)
57605777
}
57615778
} else {
57625779
unlock(&sched.lock)

0 commit comments

Comments
 (0)