Skip to content

Commit ff2aa05

Browse files
authored
internal: fix GO_AWAY deadlock (#2391)
internal: fix GO_AWAY deadlock A deadlock can occur when a GO_AWAY is followed by a connection closure. This happens because onClose needlessly closes the current ac.transport: if a GO_AWAY already occurred, and the transport was already reset, then the later closure (of the original address) sets ac.transport - which is now healthy - to nil. The manifestation of this problem is that picker_wrapper spins forever trying to use a READY connection whose ac.transport is nil.
1 parent 39444b9 commit ff2aa05

File tree

2 files changed

+109
-3
lines changed

2 files changed

+109
-3
lines changed

clientconn.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,9 +1066,6 @@ func (ac *addrConn) createTransport(backoffNum int, addr resolver.Address, copts
10661066
case <-skipReset: // The outer resetTransport loop will handle reconnection.
10671067
return
10681068
case <-allowedToReset: // We're in the clear to reset.
1069-
ac.mu.Lock()
1070-
ac.transport = nil
1071-
ac.mu.Unlock()
10721069
oneReset.Do(func() { ac.resetTransport(false) })
10731070
}
10741071
}

test/end2end_test.go

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ import (
5757
healthgrpc "google.golang.org/grpc/health/grpc_health_v1"
5858
healthpb "google.golang.org/grpc/health/grpc_health_v1"
5959
"google.golang.org/grpc/internal/channelz"
60+
"google.golang.org/grpc/internal/grpcsync"
6061
"google.golang.org/grpc/internal/leakcheck"
6162
"google.golang.org/grpc/internal/testutils"
6263
"google.golang.org/grpc/keepalive"
@@ -6994,3 +6995,111 @@ func testLargeTimeout(t *testing.T, e env) {
69946995
}
69956996
}
69966997
}
6998+
6999+
// Proxies typically send GO_AWAY followed by connection closure a minute or so later. This
7000+
// test ensures that the connection is re-created after GO_AWAY and not affected by the
7001+
// subsequent (old) connection closure.
7002+
func TestGoAwayThenClose(t *testing.T) {
7003+
defer leakcheck.Check(t)
7004+
7005+
ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
7006+
defer cancel()
7007+
7008+
lis1, err := net.Listen("tcp", "localhost:0")
7009+
if err != nil {
7010+
t.Fatalf("Error while listening. Err: %v", err)
7011+
}
7012+
s1 := grpc.NewServer()
7013+
defer s1.Stop()
7014+
ts1 := &funcServer{
7015+
unaryCall: func(ctx context.Context, in *testpb.SimpleRequest) (*testpb.SimpleResponse, error) {
7016+
return &testpb.SimpleResponse{}, nil
7017+
},
7018+
fullDuplexCall: func(stream testpb.TestService_FullDuplexCallServer) error {
7019+
// Wait forever.
7020+
_, err := stream.Recv()
7021+
if err == nil {
7022+
t.Error("expected to never receive any message")
7023+
}
7024+
return err
7025+
},
7026+
}
7027+
testpb.RegisterTestServiceServer(s1, ts1)
7028+
go s1.Serve(lis1)
7029+
7030+
conn2Established := grpcsync.NewEvent()
7031+
lis2, err := listenWithNotifyingListener("tcp", "localhost:0", conn2Established)
7032+
if err != nil {
7033+
t.Fatalf("Error while listening. Err: %v", err)
7034+
}
7035+
s2 := grpc.NewServer()
7036+
defer s2.Stop()
7037+
conn2Ready := grpcsync.NewEvent()
7038+
ts2 := &funcServer{unaryCall: func(ctx context.Context, in *testpb.SimpleRequest) (*testpb.SimpleResponse, error) {
7039+
conn2Ready.Fire()
7040+
return &testpb.SimpleResponse{}, nil
7041+
}}
7042+
testpb.RegisterTestServiceServer(s2, ts2)
7043+
go s2.Serve(lis2)
7044+
7045+
r, rcleanup := manual.GenerateAndRegisterManualResolver()
7046+
defer rcleanup()
7047+
r.InitialAddrs([]resolver.Address{
7048+
{Addr: lis1.Addr().String()},
7049+
{Addr: lis2.Addr().String()},
7050+
})
7051+
cc, err := grpc.DialContext(ctx, r.Scheme()+":///", grpc.WithInsecure())
7052+
if err != nil {
7053+
t.Fatalf("Error creating client: %v", err)
7054+
}
7055+
defer cc.Close()
7056+
7057+
client := testpb.NewTestServiceClient(cc)
7058+
7059+
// Should go on connection 1. We use a long-lived RPC because it will cause GracefulStop to send GO_AWAY, but the
7060+
// connection doesn't get closed until the server stops and the client receives.
7061+
stream, err := client.FullDuplexCall(ctx)
7062+
if err != nil {
7063+
t.Fatalf("FullDuplexCall(_) = _, %v; want _, nil", err)
7064+
}
7065+
7066+
// Send GO_AWAY to connection 1.
7067+
go s1.GracefulStop()
7068+
7069+
// Wait for connection 2 to be established.
7070+
<-conn2Established.Done()
7071+
7072+
// Close connection 1.
7073+
s1.Stop()
7074+
7075+
// Wait for client to close.
7076+
_, err = stream.Recv()
7077+
if err == nil {
7078+
t.Fatal("expected the stream to die, but got a successful Recv")
7079+
}
7080+
7081+
// Do a bunch of RPCs, make sure it stays stable. These should go to connection 2.
7082+
for i := 0; i < 10; i++ {
7083+
if _, err := client.UnaryCall(ctx, &testpb.SimpleRequest{}); err != nil {
7084+
t.Fatalf("UnaryCall(_) = _, %v; want _, nil", err)
7085+
}
7086+
}
7087+
}
7088+
7089+
func listenWithNotifyingListener(network, address string, event *grpcsync.Event) (net.Listener, error) {
7090+
lis, err := net.Listen(network, address)
7091+
if err != nil {
7092+
return nil, err
7093+
}
7094+
return notifyingListener{connEstablished: event, Listener: lis}, nil
7095+
}
7096+
7097+
type notifyingListener struct {
7098+
connEstablished *grpcsync.Event
7099+
net.Listener
7100+
}
7101+
7102+
func (lis notifyingListener) Accept() (net.Conn, error) {
7103+
defer lis.connEstablished.Fire()
7104+
return lis.Listener.Accept()
7105+
}

0 commit comments

Comments
 (0)