Skip to content

Commit 0d8366e

Browse files
author
Paul Marks
committed
net: add sequential and RFC 6555-compliant TCP dialing.
dialSerial connects to a list of addresses in sequence. If a timeout is specified, then each address gets an equal fraction of the remaining time, with a magic constant (2 seconds) to prevent "dial a million addresses" from allotting zero time to each.

Normally, net.Dial passes the DNS stub resolver's output to dialSerial. If an error occurs (like destination/port unreachable), it quickly skips to the next address, but a blackhole in the network will cause the connection to hang until the timeout elapses. This is how UNIXy clients traditionally behave, and is usually sufficient for non-broken networks.

The DualStack flag enables dialParallel, which implements Happy Eyeballs by racing two dialSerial goroutines, giving the preferred family a head start (300ms by default). This allows clients to avoid long timeouts when the network blackholes IPv4 xor IPv6.

Fixes #8453
Fixes #8455
Fixes #8847

Change-Id: Ie415809c9226a1f7342b0217dcdd8f224ae19058
Reviewed-on: https://go-review.googlesource.com/8768
Reviewed-by: Mikio Hara <[email protected]>
Reviewed-by: Brad Fitzpatrick <[email protected]>
1 parent 12b05bf commit 0d8366e

File tree

4 files changed

+566
-87
lines changed

4 files changed

+566
-87
lines changed

src/net/dial.go

Lines changed: 184 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ type Dialer struct {
2121
//
2222
// The default is no timeout.
2323
//
24+
// When dialing a name with multiple IP addresses, the timeout
25+
// may be divided between them.
26+
//
2427
// With or without a timeout, the operating system may impose
2528
// its own earlier timeout. For instance, TCP timeouts are
2629
// often around 3 minutes.
@@ -38,13 +41,17 @@ type Dialer struct {
3841
// If nil, a local address is automatically chosen.
3942
LocalAddr Addr
4043

41-
// DualStack allows a single dial to attempt to establish
42-
// multiple IPv4 and IPv6 connections and to return the first
43-
// established connection when the network is "tcp" and the
44-
// destination is a host name that has multiple address family
45-
// DNS records.
44+
// DualStack enables RFC 6555-compliant "Happy Eyeballs" dialing
45+
// when the network is "tcp" and the destination is a host name
46+
// with both IPv4 and IPv6 addresses. This allows a client to
47+
// tolerate networks where one address family is silently broken.
4648
DualStack bool
4749

50+
// FallbackDelay specifies the length of time to wait before
51+
// spawning a fallback connection, when DualStack is enabled.
52+
// If zero, a default delay of 300ms is used.
53+
FallbackDelay time.Duration
54+
4855
// KeepAlive specifies the keep-alive period for an active
4956
// network connection.
5057
// If zero, keep-alives are not enabled. Network protocols
@@ -54,18 +61,51 @@ type Dialer struct {
5461

5562
// Return either now+Timeout or Deadline, whichever comes first.
5663
// Or zero, if neither is set.
57-
func (d *Dialer) deadline() time.Time {
64+
func (d *Dialer) deadline(now time.Time) time.Time {
5865
if d.Timeout == 0 {
5966
return d.Deadline
6067
}
61-
timeoutDeadline := time.Now().Add(d.Timeout)
68+
timeoutDeadline := now.Add(d.Timeout)
6269
if d.Deadline.IsZero() || timeoutDeadline.Before(d.Deadline) {
6370
return timeoutDeadline
6471
} else {
6572
return d.Deadline
6673
}
6774
}
6875

76+
// partialDeadline returns the deadline to use for a single address,
77+
// when multiple addresses are pending.
78+
func (d *Dialer) partialDeadline(now time.Time, addrsRemaining int) (time.Time, error) {
79+
deadline := d.deadline(now)
80+
if deadline.IsZero() {
81+
return deadline, nil
82+
}
83+
timeRemaining := deadline.Sub(now)
84+
if timeRemaining <= 0 {
85+
return time.Time{}, errTimeout
86+
}
87+
// Tentatively allocate equal time to each remaining address.
88+
timeout := timeRemaining / time.Duration(addrsRemaining)
89+
// If the time per address is too short, steal from the end of the list.
90+
const saneMinimum = 2 * time.Second
91+
if timeout < saneMinimum {
92+
if timeRemaining < saneMinimum {
93+
timeout = timeRemaining
94+
} else {
95+
timeout = saneMinimum
96+
}
97+
}
98+
return now.Add(timeout), nil
99+
}
100+
101+
func (d *Dialer) fallbackDelay() time.Duration {
102+
if d.FallbackDelay > 0 {
103+
return d.FallbackDelay
104+
} else {
105+
return 300 * time.Millisecond
106+
}
107+
}
108+
69109
func parseNetwork(net string) (afnet string, proto int, err error) {
70110
i := last(net, ':')
71111
if i < 0 { // no colon
@@ -154,30 +194,44 @@ func DialTimeout(network, address string, timeout time.Duration) (Conn, error) {
154194
return d.Dial(network, address)
155195
}
156196

197+
// dialContext holds common state for all dial operations.
198+
type dialContext struct {
199+
Dialer
200+
network, address string
201+
}
202+
157203
// Dial connects to the address on the named network.
158204
//
159205
// See func Dial for a description of the network and address
160206
// parameters.
161207
func (d *Dialer) Dial(network, address string) (Conn, error) {
162-
addrs, err := resolveAddrList("dial", network, address, d.deadline())
208+
addrs, err := resolveAddrList("dial", network, address, d.deadline(time.Now()))
163209
if err != nil {
164210
return nil, &OpError{Op: "dial", Net: network, Source: nil, Addr: nil, Err: err}
165211
}
166-
var dialer func(deadline time.Time) (Conn, error)
212+
213+
ctx := &dialContext{
214+
Dialer: *d,
215+
network: network,
216+
address: address,
217+
}
218+
219+
var primaries, fallbacks addrList
167220
if d.DualStack && network == "tcp" {
168-
primaries, fallbacks := addrs.partition(isIPv4)
169-
if len(fallbacks) > 0 {
170-
dialer = func(deadline time.Time) (Conn, error) {
171-
return dialMulti(network, address, d.LocalAddr, addrList{primaries[0], fallbacks[0]}, deadline)
172-
}
173-
}
221+
primaries, fallbacks = addrs.partition(isIPv4)
222+
} else {
223+
primaries = addrs
174224
}
175-
if dialer == nil {
176-
dialer = func(deadline time.Time) (Conn, error) {
177-
return dialSingle(network, address, d.LocalAddr, addrs.first(isIPv4), deadline)
178-
}
225+
226+
var c Conn
227+
if len(fallbacks) == 0 {
228+
// dialParallel can accept an empty fallbacks list,
229+
// but this shortcut avoids the goroutine/channel overhead.
230+
c, err = dialSerial(ctx, primaries, nil)
231+
} else {
232+
c, err = dialParallel(ctx, primaries, fallbacks)
179233
}
180-
c, err := dial(network, addrs.first(isIPv4), dialer, d.deadline())
234+
181235
if d.KeepAlive > 0 && err == nil {
182236
if tc, ok := c.(*TCPConn); ok {
183237
setKeepAlive(tc.fd, true)
@@ -188,70 +242,135 @@ func (d *Dialer) Dial(network, address string) (Conn, error) {
188242
return c, err
189243
}
190244

191-
// dialMulti attempts to establish connections to each destination of
192-
// the list of addresses. It will return the first established
193-
// connection and close the other connections. Otherwise it returns
194-
// error on the last attempt.
195-
func dialMulti(net, addr string, la Addr, ras addrList, deadline time.Time) (Conn, error) {
196-
type racer struct {
197-
Conn
198-
error
245+
// dialParallel races two copies of dialSerial, giving the first a
246+
// head start. It returns the first established connection and
247+
// closes the others. Otherwise it returns an error from the first
248+
// primary address.
249+
func dialParallel(ctx *dialContext, primaries, fallbacks addrList) (Conn, error) {
250+
results := make(chan dialResult) // unbuffered, so dialSerialAsync can detect race loss & cleanup
251+
cancel := make(chan struct{})
252+
defer close(cancel)
253+
254+
// Spawn the primary racer.
255+
go dialSerialAsync(ctx, primaries, nil, cancel, results)
256+
257+
// Spawn the fallback racer.
258+
fallbackTimer := time.NewTimer(ctx.fallbackDelay())
259+
go dialSerialAsync(ctx, fallbacks, fallbackTimer, cancel, results)
260+
261+
var primaryErr error
262+
for nracers := 2; nracers > 0; nracers-- {
263+
res := <-results
264+
// If we're still waiting for a connection, then hasten the delay.
265+
// Otherwise, disable the Timer and let cancel take over.
266+
if fallbackTimer.Stop() && res.error != nil {
267+
fallbackTimer.Reset(0)
268+
}
269+
if res.error == nil {
270+
return res.Conn, nil
271+
}
272+
if res.primary {
273+
primaryErr = res.error
274+
}
275+
}
276+
return nil, primaryErr
277+
}
278+
279+
type dialResult struct {
280+
Conn
281+
error
282+
primary bool
283+
}
284+
285+
// dialSerialAsync runs dialSerial after some delay, and returns the
286+
// resulting connection through a channel. When racing two connections,
287+
// the primary goroutine uses a nil timer to omit the delay.
288+
func dialSerialAsync(ctx *dialContext, ras addrList, timer *time.Timer, cancel <-chan struct{}, results chan<- dialResult) {
289+
if timer != nil {
290+
// We're in the fallback goroutine; sleep before connecting.
291+
select {
292+
case <-timer.C:
293+
case <-cancel:
294+
return
295+
}
199296
}
200-
// Sig controls the flow of dial results on lane. It passes a
201-
// token to the next racer and also indicates the end of flow
202-
// by using closed channel.
203-
sig := make(chan bool, 1)
204-
lane := make(chan racer, 1)
205-
for _, ra := range ras {
206-
go func(ra Addr) {
207-
c, err := dialSingle(net, addr, la, ra, deadline)
208-
if _, ok := <-sig; ok {
209-
lane <- racer{c, err}
210-
} else if err == nil {
211-
// We have to return the resources
212-
// that belong to the other
213-
// connections here for avoiding
214-
// unnecessary resource starvation.
215-
c.Close()
216-
}
217-
}(ra)
297+
c, err := dialSerial(ctx, ras, cancel)
298+
select {
299+
case results <- dialResult{c, err, timer == nil}:
300+
// We won the race.
301+
case <-cancel:
302+
// The other goroutine won the race.
303+
if c != nil {
304+
c.Close()
305+
}
218306
}
219-
defer close(sig)
220-
lastErr := errTimeout
221-
nracers := len(ras)
222-
for nracers > 0 {
223-
sig <- true
224-
racer := <-lane
225-
if racer.error == nil {
226-
return racer.Conn, nil
307+
}
308+
309+
// dialSerial connects to a list of addresses in sequence, returning
310+
// either the first successful connection, or the first error.
311+
func dialSerial(ctx *dialContext, ras addrList, cancel <-chan struct{}) (Conn, error) {
312+
var firstErr error // The error from the first address is most relevant.
313+
314+
for i, ra := range ras {
315+
select {
316+
case <-cancel:
317+
return nil, &OpError{Op: "dial", Net: ctx.network, Source: ctx.LocalAddr, Addr: ra, Err: errCanceled}
318+
default:
319+
}
320+
321+
partialDeadline, err := ctx.partialDeadline(time.Now(), len(ras)-i)
322+
if err != nil {
323+
// Ran out of time.
324+
if firstErr == nil {
325+
firstErr = &OpError{Op: "dial", Net: ctx.network, Source: ctx.LocalAddr, Addr: ra, Err: err}
326+
}
327+
break
227328
}
228-
lastErr = racer.error
229-
nracers--
329+
330+
// dialTCP does not support cancelation (see golang.org/issue/11225),
331+
// so if cancel fires, we'll continue trying to connect until the next
332+
// timeout, or return a spurious connection for the caller to close.
333+
dialer := func(d time.Time) (Conn, error) {
334+
return dialSingle(ctx, ra, d)
335+
}
336+
c, err := dial(ctx.network, ra, dialer, partialDeadline)
337+
if err == nil {
338+
return c, nil
339+
}
340+
if firstErr == nil {
341+
firstErr = err
342+
}
343+
}
344+
345+
if firstErr == nil {
346+
firstErr = &OpError{Op: "dial", Net: ctx.network, Source: nil, Addr: nil, Err: errMissingAddress}
230347
}
231-
return nil, lastErr
348+
return nil, firstErr
232349
}
233350

234351
// dialSingle attempts to establish and returns a single connection to
235-
// the destination address.
236-
func dialSingle(net, addr string, la, ra Addr, deadline time.Time) (c Conn, err error) {
352+
// the destination address. This must be called through the OS-specific
353+
// dial function, because some OSes don't implement the deadline feature.
354+
func dialSingle(ctx *dialContext, ra Addr, deadline time.Time) (c Conn, err error) {
355+
la := ctx.LocalAddr
237356
if la != nil && la.Network() != ra.Network() {
238-
return nil, &OpError{Op: "dial", Net: net, Source: la, Addr: ra, Err: errors.New("mismatched local address type " + la.Network())}
357+
return nil, &OpError{Op: "dial", Net: ctx.network, Source: la, Addr: ra, Err: errors.New("mismatched local address type " + la.Network())}
239358
}
240359
switch ra := ra.(type) {
241360
case *TCPAddr:
242361
la, _ := la.(*TCPAddr)
243-
c, err = dialTCP(net, la, ra, deadline)
362+
c, err = testHookDialTCP(ctx.network, la, ra, deadline)
244363
case *UDPAddr:
245364
la, _ := la.(*UDPAddr)
246-
c, err = dialUDP(net, la, ra, deadline)
365+
c, err = dialUDP(ctx.network, la, ra, deadline)
247366
case *IPAddr:
248367
la, _ := la.(*IPAddr)
249-
c, err = dialIP(net, la, ra, deadline)
368+
c, err = dialIP(ctx.network, la, ra, deadline)
250369
case *UnixAddr:
251370
la, _ := la.(*UnixAddr)
252-
c, err = dialUnix(net, la, ra, deadline)
371+
c, err = dialUnix(ctx.network, la, ra, deadline)
253372
default:
254-
return nil, &OpError{Op: "dial", Net: net, Source: la, Addr: ra, Err: &AddrError{Err: "unexpected address type", Addr: addr}}
373+
return nil, &OpError{Op: "dial", Net: ctx.network, Source: la, Addr: ra, Err: &AddrError{Err: "unexpected address type", Addr: ctx.address}}
255374
}
256375
if err != nil {
257376
return nil, err // c is non-nil interface containing nil pointer

0 commit comments

Comments
 (0)