Skip to content

Commit 4e69fdd

Browse files
rscianlancetaylor
authored andcommitted
[release-branch.go1.17] runtime: fix net poll races
The netpoll code was written long ago, when the only multiprocessors that Go ran on were x86. It assumed that an atomic store would trigger a full memory barrier and then used that barrier to order otherwise racy access to a handful of fields, including pollDesc.closing. On ARM64, this code has finally failed, because the atomic store is on a value completely unrelated to any of the racily-accessed fields, and the ARMv8 hardware, unlike x86, is clever enough not to do a full memory barrier for a simple atomic store. We are seeing a constant background rate of trybot failures where the net/http tests deadlock - a netpollblock has clearly happened after the pollDesc has begun to close. The code that does the racy reads is netpollcheckerr, which needs to be able to run without acquiring a lock. This CL fixes the race, without introducing unnecessary inefficiency or deadlock, by arranging for every updater of the relevant fields to publish a summary as a single atomic uint32, and then having netpollcheckerr use a single atomic load to fetch the relevant bits and then proceed as before. For #45211 Fixes #50611 Change-Id: Ib6788c8da4d00b7bda84d55ca3fdffb5a64c1a0a Reviewed-on: https://go-review.googlesource.com/c/go/+/378234 Trust: Russ Cox <[email protected]> Run-TryBot: Russ Cox <[email protected]> Trust: Bryan Mills <[email protected]> Reviewed-by: Ian Lance Taylor <[email protected]> (cherry picked from commit 17b2fb1) Reviewed-on: https://go-review.googlesource.com/c/go/+/392714 Trust: Ian Lance Taylor <[email protected]> Run-TryBot: Ian Lance Taylor <[email protected]> TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Emmanuel Odeke <[email protected]>
1 parent efed283 commit 4e69fdd

File tree

6 files changed

+105
-43
lines changed

6 files changed

+105
-43
lines changed

src/runtime/netpoll.go

Lines changed: 99 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -72,31 +72,99 @@ const pollBlockSize = 4 * 1024
7272
//go:notinheap
7373
type pollDesc struct {
7474
link *pollDesc // in pollcache, protected by pollcache.lock
75+
fd uintptr // constant for pollDesc usage lifetime
76+
77+
// atomicInfo holds bits from closing, rd, and wd,
78+
// which are only ever written while holding the lock,
79+
// summarized for use by netpollcheckerr,
80+
// which cannot acquire the lock.
81+
// After writing these fields under lock in a way that
82+
// might change the summary, code must call publishInfo
83+
// before releasing the lock.
84+
// Code that changes fields and then calls netpollunblock
85+
// (while still holding the lock) must call publishInfo
86+
// before calling netpollunblock, because publishInfo is what
87+
// stops netpollblock from blocking anew
88+
// (by changing the result of netpollcheckerr).
89+
// atomicInfo also holds the eventErr bit,
90+
// recording whether a poll event on the fd got an error;
91+
// atomicInfo is the only source of truth for that bit.
92+
atomicInfo uint32 // atomic pollInfo
93+
94+
// rg, wg are accessed atomically and hold g pointers.
95+
// (Using atomic.Uintptr here is similar to using guintptr elsewhere.)
96+
rg uintptr // pdReady, pdWait, G waiting for read or nil
97+
wg uintptr // pdReady, pdWait, G waiting for write or nil
7598

76-
// The lock protects pollOpen, pollSetDeadline, pollUnblock and deadlineimpl operations.
77-
// This fully covers seq, rt and wt variables. fd is constant throughout the PollDesc lifetime.
78-
// pollReset, pollWait, pollWaitCanceled and runtime·netpollready (IO readiness notification)
79-
// proceed w/o taking the lock. So closing, everr, rg, rd, wg and wd are manipulated
80-
// in a lock-free way by all operations.
81-
// TODO(golang.org/issue/49008): audit these lock-free fields for continued correctness.
82-
// NOTE(dvyukov): the following code uses uintptr to store *g (rg/wg),
83-
// that will blow up when GC starts moving objects.
8499
lock mutex // protects the following fields
85-
fd uintptr
86100
closing bool
87-
everr bool // marks event scanning error happened
88101
user uint32 // user settable cookie
89102
rseq uintptr // protects from stale read timers
90-
rg uintptr // pdReady, pdWait, G waiting for read or nil. Accessed atomically.
91103
rt timer // read deadline timer (set if rt.f != nil)
92-
rd int64 // read deadline
104+
rd int64 // read deadline (a nanotime in the future, -1 when expired)
93105
wseq uintptr // protects from stale write timers
94-
wg uintptr // pdReady, pdWait, G waiting for write or nil. Accessed atomically.
95106
wt timer // write deadline timer
96-
wd int64 // write deadline
107+
wd int64 // write deadline (a nanotime in the future, -1 when expired)
97108
self *pollDesc // storage for indirect interface. See (*pollDesc).makeArg.
98109
}
99110

111+
// pollInfo is the bits needed by netpollcheckerr, stored atomically,
112+
// mostly duplicating state that is manipulated under lock in pollDesc.
113+
// The one exception is the pollEventErr bit, which is maintained only
114+
// in the pollInfo.
115+
type pollInfo uint32
116+
117+
const (
118+
pollClosing = 1 << iota
119+
pollEventErr
120+
pollExpiredReadDeadline
121+
pollExpiredWriteDeadline
122+
)
123+
124+
func (i pollInfo) closing() bool { return i&pollClosing != 0 }
125+
func (i pollInfo) eventErr() bool { return i&pollEventErr != 0 }
126+
func (i pollInfo) expiredReadDeadline() bool { return i&pollExpiredReadDeadline != 0 }
127+
func (i pollInfo) expiredWriteDeadline() bool { return i&pollExpiredWriteDeadline != 0 }
128+
129+
// info returns the pollInfo corresponding to pd.
130+
func (pd *pollDesc) info() pollInfo {
131+
return pollInfo(atomic.Load(&pd.atomicInfo))
132+
}
133+
134+
// publishInfo updates pd.atomicInfo (returned by pd.info)
135+
// using the other values in pd.
136+
// It must be called while holding pd.lock,
137+
// and it must be called after changing anything
138+
// that might affect the info bits.
139+
// In practice this means after changing closing
140+
// or changing rd or wd from < 0 to >= 0.
141+
func (pd *pollDesc) publishInfo() {
142+
var info uint32
143+
if pd.closing {
144+
info |= pollClosing
145+
}
146+
if pd.rd < 0 {
147+
info |= pollExpiredReadDeadline
148+
}
149+
if pd.wd < 0 {
150+
info |= pollExpiredWriteDeadline
151+
}
152+
153+
// Set all of x except the pollEventErr bit.
154+
x := atomic.Load(&pd.atomicInfo)
155+
for !atomic.Cas(&pd.atomicInfo, x, (x&pollEventErr)|info) {
156+
x = atomic.Load(&pd.atomicInfo)
157+
}
158+
}
159+
160+
// setEventErr sets the result of pd.info().eventErr() to b.
161+
func (pd *pollDesc) setEventErr(b bool) {
162+
x := atomic.Load(&pd.atomicInfo)
163+
for (x&pollEventErr != 0) != b && !atomic.Cas(&pd.atomicInfo, x, x^pollEventErr) {
164+
x = atomic.Load(&pd.atomicInfo)
165+
}
166+
}
167+
100168
type pollCache struct {
101169
lock mutex
102170
first *pollDesc
@@ -158,14 +226,15 @@ func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
158226
}
159227
pd.fd = fd
160228
pd.closing = false
161-
pd.everr = false
229+
pd.setEventErr(false)
162230
pd.rseq++
163231
atomic.Storeuintptr(&pd.rg, 0)
164232
pd.rd = 0
165233
pd.wseq++
166234
atomic.Storeuintptr(&pd.wg, 0)
167235
pd.wd = 0
168236
pd.self = pd
237+
pd.publishInfo()
169238
unlock(&pd.lock)
170239

171240
errno := netpollopen(fd, pd)
@@ -274,6 +343,7 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) {
274343
if mode == 'w' || mode == 'r'+'w' {
275344
pd.wd = d
276345
}
346+
pd.publishInfo()
277347
combo := pd.rd > 0 && pd.rd == pd.wd
278348
rtf := netpollReadDeadline
279349
if combo {
@@ -315,15 +385,13 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) {
315385
}
316386
}
317387
// If we set the new deadline in the past, unblock currently pending IO if any.
388+
// Note that pd.publishInfo has already been called, above, immediately after modifying rd and wd.
318389
var rg, wg *g
319-
if pd.rd < 0 || pd.wd < 0 {
320-
atomic.StorepNoWB(noescape(unsafe.Pointer(&wg)), nil) // full memory barrier between stores to rd/wd and load of rg/wg in netpollunblock
321-
if pd.rd < 0 {
322-
rg = netpollunblock(pd, 'r', false)
323-
}
324-
if pd.wd < 0 {
325-
wg = netpollunblock(pd, 'w', false)
326-
}
390+
if pd.rd < 0 {
391+
rg = netpollunblock(pd, 'r', false)
392+
}
393+
if pd.wd < 0 {
394+
wg = netpollunblock(pd, 'w', false)
327395
}
328396
unlock(&pd.lock)
329397
if rg != nil {
@@ -344,7 +412,7 @@ func poll_runtime_pollUnblock(pd *pollDesc) {
344412
pd.rseq++
345413
pd.wseq++
346414
var rg, wg *g
347-
atomic.StorepNoWB(noescape(unsafe.Pointer(&rg)), nil) // full memory barrier between store to closing and read of rg/wg in netpollunblock
415+
pd.publishInfo()
348416
rg = netpollunblock(pd, 'r', false)
349417
wg = netpollunblock(pd, 'w', false)
350418
if pd.rt.f != nil {
@@ -389,16 +457,17 @@ func netpollready(toRun *gList, pd *pollDesc, mode int32) {
389457
}
390458

391459
func netpollcheckerr(pd *pollDesc, mode int32) int {
392-
if pd.closing {
460+
info := pd.info()
461+
if info.closing() {
393462
return pollErrClosing
394463
}
395-
if (mode == 'r' && pd.rd < 0) || (mode == 'w' && pd.wd < 0) {
464+
if (mode == 'r' && info.expiredReadDeadline()) || (mode == 'w' && info.expiredWriteDeadline()) {
396465
return pollErrTimeout
397466
}
398467
// Report an event scanning error only on a read event.
399468
// An error on a write event will be captured in a subsequent
400469
// write call that is able to report a more specific error.
401-
if mode == 'r' && pd.everr {
470+
if mode == 'r' && info.eventErr() {
402471
return pollErrNotPollable
403472
}
404473
return pollNoError
@@ -449,7 +518,7 @@ func netpollblock(pd *pollDesc, mode int32, waitio bool) bool {
449518

450519
// need to recheck error states after setting gpp to pdWait
451520
// this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl
452-
// do the opposite: store to closing/rd/wd, membarrier, load of rg/wg
521+
// do the opposite: store to closing/rd/wd, publishInfo, load of rg/wg
453522
if waitio || netpollcheckerr(pd, mode) == 0 {
454523
gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5)
455524
}
@@ -509,7 +578,7 @@ func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) {
509578
throw("runtime: inconsistent read deadline")
510579
}
511580
pd.rd = -1
512-
atomic.StorepNoWB(unsafe.Pointer(&pd.rt.f), nil) // full memory barrier between store to rd and load of rg in netpollunblock
581+
pd.publishInfo()
513582
rg = netpollunblock(pd, 'r', false)
514583
}
515584
var wg *g
@@ -518,7 +587,7 @@ func netpolldeadlineimpl(pd *pollDesc, seq uintptr, read, write bool) {
518587
throw("runtime: inconsistent write deadline")
519588
}
520589
pd.wd = -1
521-
atomic.StorepNoWB(unsafe.Pointer(&pd.wt.f), nil) // full memory barrier between store to wd and load of wg in netpollunblock
590+
pd.publishInfo()
522591
wg = netpollunblock(pd, 'w', false)
523592
}
524593
unlock(&pd.lock)

src/runtime/netpoll_aix.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,7 @@ retry:
212212
pfd.events &= ^_POLLOUT
213213
}
214214
if mode != 0 {
215-
pds[i].everr = false
216-
if pfd.revents == _POLLERR {
217-
pds[i].everr = true
218-
}
215+
pds[i].setEventErr(pfd.revents == _POLLERR)
219216
netpollready(&toRun, pds[i], mode)
220217
n--
221218
}

src/runtime/netpoll_epoll.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,10 +169,7 @@ retry:
169169
}
170170
if mode != 0 {
171171
pd := *(**pollDesc)(unsafe.Pointer(&ev.data))
172-
pd.everr = false
173-
if ev.events == _EPOLLERR {
174-
pd.everr = true
175-
}
172+
pd.setEventErr(ev.events == _EPOLLERR)
176173
netpollready(&toRun, pd, mode)
177174
}
178175
}

src/runtime/netpoll_kqueue.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,7 @@ retry:
180180
}
181181
if mode != 0 {
182182
pd := (*pollDesc)(unsafe.Pointer(ev.udata))
183-
pd.everr = false
184-
if ev.flags == _EV_ERROR {
185-
pd.everr = true
186-
}
183+
pd.setEventErr(ev.flags == _EV_ERROR)
187184
netpollready(&toRun, pd, mode)
188185
}
189186
}

src/runtime/netpoll_solaris.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ func netpollclose(fd uintptr) int32 {
158158
// this call, port_getn will return one and only one event for that
159159
// particular descriptor, so this function needs to be called again.
160160
func netpollupdate(pd *pollDesc, set, clear uint32) {
161-
if pd.closing {
161+
if pd.info().closing() {
162162
return
163163
}
164164

src/runtime/runtime2.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,8 @@ func efaceOf(ep *interface{}) *eface {
255255
// so I can't see them ever moving. If we did want to start moving data
256256
// in the GC, we'd need to allocate the goroutine structs from an
257257
// alternate arena. Using guintptr doesn't make that problem any worse.
258+
// Note that pollDesc.rg, pollDesc.wg also store g in uintptr form,
259+
// so they would need to be updated too if g's start moving.
258260
type guintptr uintptr
259261

260262
//go:nosplit

0 commit comments

Comments
 (0)