Skip to content

Commit 7e97e4e

Browse files
AndrewGMorganprattmic
authored andcommitted
syscall: syscall.AllThreadsSyscall signal handling fixes
The runtime support for syscall.AllThreadsSyscall() functions had some corner case deadlock issues when signal handling was in use. This was observed in at least 3 build test failures on ppc64 and amd64 architecture CGO_ENABLED=0 builds over the last few months. The fixes involve more controlled handling of signals while the AllThreads mechanism is being executed. Further details are discussed in bug #44193. The all-threads syscall support is new in go1.16, so earlier releases are not affected by this bug. Fixes #44193 Change-Id: I01ba8508a6e1bb2d872751f50da86dd07911a41d Reviewed-on: https://go-review.googlesource.com/c/go/+/305149 Reviewed-by: Michael Pratt <[email protected]> Trust: Michael Pratt <[email protected]> Trust: Ian Lance Taylor <[email protected]> Run-TryBot: Michael Pratt <[email protected]> TryBot-Result: Go Bot <[email protected]>
1 parent 54af9fd commit 7e97e4e

File tree

4 files changed

+96
-7
lines changed

4 files changed

+96
-7
lines changed

src/os/signal/signal_test.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"os"
1717
"os/exec"
1818
"runtime"
19+
"runtime/trace"
1920
"strconv"
2021
"sync"
2122
"syscall"
@@ -854,3 +855,44 @@ func TestNotifyContextStringer(t *testing.T) {
854855
t.Errorf("c.String() = %q, want %q", got, want)
855856
}
856857
}
858+
859+
// #44193 test signal handling while stopping and starting the world.
860+
func TestSignalTrace(t *testing.T) {
861+
done := make(chan struct{})
862+
quit := make(chan struct{})
863+
c := make(chan os.Signal, 1)
864+
Notify(c, syscall.SIGHUP)
865+
866+
// Source and sink for signals busy loop unsynchronized with
867+
// trace starts and stops. We are ultimately validating that
868+
// signals and runtime.(stop|start)TheWorldGC are compatible.
869+
go func() {
870+
defer close(done)
871+
defer Stop(c)
872+
pid := syscall.Getpid()
873+
for {
874+
select {
875+
case <-quit:
876+
return
877+
default:
878+
syscall.Kill(pid, syscall.SIGHUP)
879+
}
880+
waitSig(t, c, syscall.SIGHUP)
881+
}
882+
}()
883+
884+
for i := 0; i < 100; i++ {
885+
buf := new(bytes.Buffer)
886+
if err := trace.Start(buf); err != nil {
887+
t.Fatalf("[%d] failed to start tracing: %v", i, err)
888+
}
889+
time.After(1 * time.Microsecond)
890+
trace.Stop()
891+
size := buf.Len()
892+
if size == 0 {
893+
t.Fatalf("[%d] trace is empty", i)
894+
}
895+
}
896+
close(quit)
897+
<-done
898+
}

src/runtime/proc.go

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,6 +1402,9 @@ func mPark() {
14021402
g := getg()
14031403
for {
14041404
notesleep(&g.m.park)
1405+
// Note, because of signal handling by this parked m,
1406+
// a preemptive mDoFixup() may actually occur via
1407+
// mDoFixupAndOSYield(). (See golang.org/issue/44193)
14051408
noteclear(&g.m.park)
14061409
if !mDoFixup() {
14071410
return
@@ -1635,6 +1638,22 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
16351638
for atomic.Load(&sched.sysmonStarting) != 0 {
16361639
osyield()
16371640
}
1641+
1642+
// We don't want this thread to handle signals for the
1643+
// duration of this critical section. The underlying issue
1644+
// being that this locked coordinating m is the one monitoring
1645+
// for fn() execution by all the other m's of the runtime,
1646+
// while no regular go code execution is permitted (the world
1647+
// is stopped). If this present m were to get distracted to
1648+
// run signal handling code, and find itself waiting for a
1649+
// second thread to execute go code before being able to
1650+
// return from that signal handling, a deadlock will result.
1651+
// (See golang.org/issue/44193.)
1652+
lockOSThread()
1653+
var sigmask sigset
1654+
sigsave(&sigmask)
1655+
sigblock(false)
1656+
16381657
stopTheWorldGC("doAllThreadsSyscall")
16391658
if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
16401659
// Ensure that there are no in-flight thread
@@ -1686,6 +1705,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
16861705
// the possibility of racing with mp.
16871706
lock(&mp.mFixup.lock)
16881707
mp.mFixup.fn = fn
1708+
atomic.Store(&mp.mFixup.used, 1)
16891709
if mp.doesPark {
16901710
// For non-service threads this will
16911711
// cause the wakeup to be short lived
@@ -1702,9 +1722,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
17021722
if mp.procid == tid {
17031723
continue
17041724
}
1705-
lock(&mp.mFixup.lock)
1706-
done = done && (mp.mFixup.fn == nil)
1707-
unlock(&mp.mFixup.lock)
1725+
done = atomic.Load(&mp.mFixup.used) == 0
17081726
}
17091727
if done {
17101728
break
@@ -1731,6 +1749,8 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
17311749
unlock(&mFixupRace.lock)
17321750
}
17331751
startTheWorldGC()
1752+
msigrestore(sigmask)
1753+
unlockOSThread()
17341754
}
17351755

17361756
// runSafePointFn runs the safe point function, if any, for this P.
@@ -2225,9 +2245,21 @@ var mFixupRace struct {
22252245
// mDoFixup runs any outstanding fixup function for the running m.
22262246
// Returns true if a fixup was outstanding and actually executed.
22272247
//
2248+
// Note: to avoid deadlocks, and the need for the fixup function
2249+
// itself to be async safe, signals are blocked for the working m
2250+
// while it holds the mFixup lock. (See golang.org/issue/44193)
2251+
//
22282252
//go:nosplit
22292253
func mDoFixup() bool {
22302254
_g_ := getg()
2255+
if used := atomic.Load(&_g_.m.mFixup.used); used == 0 {
2256+
return false
2257+
}
2258+
2259+
// slow path - if fixup fn is used, block signals and lock.
2260+
var sigmask sigset
2261+
sigsave(&sigmask)
2262+
sigblock(false)
22312263
lock(&_g_.m.mFixup.lock)
22322264
fn := _g_.m.mFixup.fn
22332265
if fn != nil {
@@ -2244,7 +2276,6 @@ func mDoFixup() bool {
22442276
// is more obviously safe.
22452277
throw("GC must be disabled to protect validity of fn value")
22462278
}
2247-
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
22482279
if _g_.racectx != 0 || !raceenabled {
22492280
fn(false)
22502281
} else {
@@ -2259,11 +2290,24 @@ func mDoFixup() bool {
22592290
_g_.racectx = 0
22602291
unlock(&mFixupRace.lock)
22612292
}
2293+
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
2294+
atomic.Store(&_g_.m.mFixup.used, 0)
22622295
}
22632296
unlock(&_g_.m.mFixup.lock)
2297+
msigrestore(sigmask)
22642298
return fn != nil
22652299
}
22662300

2301+
// mDoFixupAndOSYield is called when an m is unable to send a signal
2302+
// because the allThreadsSyscall mechanism is in progress. That is, an
2303+
// mPark() has been interrupted with this signal handler so we need to
2304+
// ensure the fixup is executed from this context.
2305+
//go:nosplit
2306+
func mDoFixupAndOSYield() {
2307+
mDoFixup()
2308+
osyield()
2309+
}
2310+
22672311
// templateThread is a thread in a known-good state that exists solely
22682312
// to start new threads in known-good states when the calling thread
22692313
// may not be in a good state.

src/runtime/runtime2.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -554,10 +554,13 @@ type m struct {
554554
syscalltick uint32
555555
freelink *m // on sched.freem
556556

557-
// mFixup is used to synchronize OS related m state (credentials etc)
558-
// use mutex to access.
557+
// mFixup is used to synchronize OS related m state
558+
// (credentials etc) use mutex to access. To avoid deadlocks
559+
// an atomic.Load() of used being zero in mDoFixupFn()
560+
// guarantees fn is nil.
559561
mFixup struct {
560562
lock mutex
563+
used uint32
561564
fn func(bool) bool
562565
}
563566

src/runtime/sigqueue.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ Send:
120120
}
121121
case sigFixup:
122122
// nothing to do - we need to wait for sigIdle.
123-
osyield()
123+
mDoFixupAndOSYield()
124124
}
125125
}
126126

0 commit comments

Comments
 (0)