Skip to content

Commit ce04f86

Browse files
AndrewGMorganianlancetaylor
authored andcommitted
[release-branch.go1.16] syscall: syscall.AllThreadsSyscall signal handling fixes
The runtime support for syscall.AllThreadsSyscall() functions had some corner case deadlock issues when signal handling was in use. This was observed in at least 3 build test failures on ppc64 and amd64 architecture CGO_ENABLED=0 builds over the last few months. The fixes involve more controlled handling of signals while the AllThreads mechanism is being executed. Further details are discussed in bug #44193. The all-threads syscall support is new in go1.16, so earlier releases are not affected by this bug. Fixes #45307 Change-Id: I01ba8508a6e1bb2d872751f50da86dd07911a41d Reviewed-on: https://go-review.googlesource.com/c/go/+/305149 Reviewed-by: Michael Pratt <[email protected]> Trust: Michael Pratt <[email protected]> Trust: Ian Lance Taylor <[email protected]> Run-TryBot: Michael Pratt <[email protected]> TryBot-Result: Go Bot <[email protected]> (cherry picked from commit 7e97e4e) Reviewed-on: https://go-review.googlesource.com/c/go/+/316869 Run-TryBot: Ian Lance Taylor <[email protected]>
1 parent 7e70979 commit ce04f86

File tree

4 files changed

+96
-7
lines changed

4 files changed

+96
-7
lines changed

src/os/signal/signal_test.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"os"
1616
"os/exec"
1717
"runtime"
18+
"runtime/trace"
1819
"strconv"
1920
"sync"
2021
"syscall"
@@ -853,3 +854,44 @@ func TestNotifyContextStringer(t *testing.T) {
853854
t.Errorf("c.String() = %q, want %q", got, want)
854855
}
855856
}
857+
858+
// #44193 test signal handling while stopping and starting the world.
859+
func TestSignalTrace(t *testing.T) {
860+
done := make(chan struct{})
861+
quit := make(chan struct{})
862+
c := make(chan os.Signal, 1)
863+
Notify(c, syscall.SIGHUP)
864+
865+
// Source and sink for signals busy loop unsynchronized with
866+
// trace starts and stops. We are ultimately validating that
867+
// signals and runtime.(stop|start)TheWorldGC are compatible.
868+
go func() {
869+
defer close(done)
870+
defer Stop(c)
871+
pid := syscall.Getpid()
872+
for {
873+
select {
874+
case <-quit:
875+
return
876+
default:
877+
syscall.Kill(pid, syscall.SIGHUP)
878+
}
879+
waitSig(t, c, syscall.SIGHUP)
880+
}
881+
}()
882+
883+
for i := 0; i < 100; i++ {
884+
buf := new(bytes.Buffer)
885+
if err := trace.Start(buf); err != nil {
886+
t.Fatalf("[%d] failed to start tracing: %v", i, err)
887+
}
888+
time.After(1 * time.Microsecond)
889+
trace.Stop()
890+
size := buf.Len()
891+
if size == 0 {
892+
t.Fatalf("[%d] trace is empty", i)
893+
}
894+
}
895+
close(quit)
896+
<-done
897+
}

src/runtime/proc.go

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1338,6 +1338,9 @@ func mPark() {
13381338
g := getg()
13391339
for {
13401340
notesleep(&g.m.park)
1341+
// Note, because of signal handling by this parked m,
1342+
// a preemptive mDoFixup() may actually occur via
1343+
// mDoFixupAndOSYield(). (See golang.org/issue/44193)
13411344
noteclear(&g.m.park)
13421345
if !mDoFixup() {
13431346
return
@@ -1571,6 +1574,22 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
15711574
for atomic.Load(&sched.sysmonStarting) != 0 {
15721575
osyield()
15731576
}
1577+
1578+
// We don't want this thread to handle signals for the
1579+
// duration of this critical section. The underlying issue
1580+
// being that this locked coordinating m is the one monitoring
1581+
// for fn() execution by all the other m's of the runtime,
1582+
// while no regular go code execution is permitted (the world
1583+
// is stopped). If this present m were to get distracted to
1584+
// run signal handling code, and find itself waiting for a
1585+
// second thread to execute go code before being able to
1586+
// return from that signal handling, a deadlock will result.
1587+
// (See golang.org/issue/44193.)
1588+
lockOSThread()
1589+
var sigmask sigset
1590+
sigsave(&sigmask)
1591+
sigblock(false)
1592+
15741593
stopTheWorldGC("doAllThreadsSyscall")
15751594
if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
15761595
// Ensure that there are no in-flight thread
@@ -1622,6 +1641,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
16221641
// the possibility of racing with mp.
16231642
lock(&mp.mFixup.lock)
16241643
mp.mFixup.fn = fn
1644+
atomic.Store(&mp.mFixup.used, 1)
16251645
if mp.doesPark {
16261646
// For non-service threads this will
16271647
// cause the wakeup to be short lived
@@ -1638,9 +1658,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
16381658
if mp.procid == tid {
16391659
continue
16401660
}
1641-
lock(&mp.mFixup.lock)
1642-
done = done && (mp.mFixup.fn == nil)
1643-
unlock(&mp.mFixup.lock)
1661+
done = atomic.Load(&mp.mFixup.used) == 0
16441662
}
16451663
if done {
16461664
break
@@ -1667,6 +1685,8 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
16671685
unlock(&mFixupRace.lock)
16681686
}
16691687
startTheWorldGC()
1688+
msigrestore(sigmask)
1689+
unlockOSThread()
16701690
}
16711691

16721692
// runSafePointFn runs the safe point function, if any, for this P.
@@ -2157,9 +2177,21 @@ var mFixupRace struct {
21572177
// mDoFixup runs any outstanding fixup function for the running m.
21582178
// Returns true if a fixup was outstanding and actually executed.
21592179
//
2180+
// Note: to avoid deadlocks, and the need for the fixup function
2181+
// itself to be async safe, signals are blocked for the working m
2182+
// while it holds the mFixup lock. (See golang.org/issue/44193)
2183+
//
21602184
//go:nosplit
21612185
func mDoFixup() bool {
21622186
_g_ := getg()
2187+
if used := atomic.Load(&_g_.m.mFixup.used); used == 0 {
2188+
return false
2189+
}
2190+
2191+
// slow path - if fixup fn is used, block signals and lock.
2192+
var sigmask sigset
2193+
sigsave(&sigmask)
2194+
sigblock(false)
21632195
lock(&_g_.m.mFixup.lock)
21642196
fn := _g_.m.mFixup.fn
21652197
if fn != nil {
@@ -2176,7 +2208,6 @@ func mDoFixup() bool {
21762208
// is more obviously safe.
21772209
throw("GC must be disabled to protect validity of fn value")
21782210
}
2179-
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
21802211
if _g_.racectx != 0 || !raceenabled {
21812212
fn(false)
21822213
} else {
@@ -2191,11 +2222,24 @@ func mDoFixup() bool {
21912222
_g_.racectx = 0
21922223
unlock(&mFixupRace.lock)
21932224
}
2225+
*(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
2226+
atomic.Store(&_g_.m.mFixup.used, 0)
21942227
}
21952228
unlock(&_g_.m.mFixup.lock)
2229+
msigrestore(sigmask)
21962230
return fn != nil
21972231
}
21982232

2233+
// mDoFixupAndOSYield is called when an m is unable to send a signal
2234+
// because the allThreadsSyscall mechanism is in progress. That is, an
2235+
// mPark() has been interrupted with this signal handler so we need to
2236+
// ensure the fixup is executed from this context.
2237+
//go:nosplit
2238+
func mDoFixupAndOSYield() {
2239+
mDoFixup()
2240+
osyield()
2241+
}
2242+
21992243
// templateThread is a thread in a known-good state that exists solely
22002244
// to start new threads in known-good states when the calling thread
22012245
// may not be in a good state.

src/runtime/runtime2.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -537,10 +537,13 @@ type m struct {
537537
syscalltick uint32
538538
freelink *m // on sched.freem
539539

540-
// mFixup is used to synchronize OS related m state (credentials etc)
541-
// use mutex to access.
540+
// mFixup is used to synchronize OS related m state
541+
// (credentials etc) use mutex to access. To avoid deadlocks
542+
// an atomic.Load() of used being zero in mDoFixupFn()
543+
// guarantees fn is nil.
542544
mFixup struct {
543545
lock mutex
546+
used uint32
544547
fn func(bool) bool
545548
}
546549

src/runtime/sigqueue.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ Send:
119119
}
120120
case sigFixup:
121121
// nothing to do - we need to wait for sigIdle.
122-
osyield()
122+
mDoFixupAndOSYield()
123123
}
124124
}
125125

0 commit comments

Comments
 (0)