Skip to content

Commit ea62bdd

Browse files
authored
feat: add health checks to proxy (#859)
## Change Description Introduce health checks to the Cloud SQL proxy client, allowing for the proactive and automatic mitigation of health-related issues. The health checks consist of **startup**, **liveness**, and **readiness** probing, with requests against the proxy container issued via HTTP. ## Checklist - [x] Make sure to open an issue as a [bug/issue](https://github.com/GoogleCloudPlatform/cloudsql-proxy/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea. - [x] Ensure the tests and linter pass - [x] Appropriate documentation is updated (if necessary) ## Relevant issues: - Fixes #137
1 parent 2c2bc8a commit ea62bdd

File tree

7 files changed

+528
-0
lines changed

7 files changed

+528
-0
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,15 @@ message and optionally stacktrace. For example, the startup message looks like:
283283
{"severity":"INFO","timestamp":"2020-10-12T07:20:50.52Z","caller":"cloud_sql_proxy/cloud_sql_proxy.go:510","message":"Using gcloud's active project: [my-project-id]"}
284284
```
285285

286+
#### `-use_http_health_check`
287+
288+
Enables HTTP health checks for the proxy, including startup, liveness, and readiness probing.
289+
Requires that you configure the Kubernetes container with HTTP probes ([instructions][health-check-example]).
290+
291+
#### `-health_check_port=8090`
292+
293+
Specifies the port on which the health check server listens. Defaults to 8090.
294+
286295
## Running as a Kubernetes Sidecar
287296

288297
See the [example here][sidecar-example] as well as [Connecting from Google
@@ -334,6 +343,7 @@ Install via Nuget, follow these
334343
[connect-to-k8s]: https://cloud.google.com/sql/docs/mysql/connect-kubernetes-engine
335344
[connection-overview]: https://cloud.google.com/sql/docs/mysql/connect-overview
336345
[contributing]: CONTRIBUTING.md
346+
[health-check-example]: https://github.com/GoogleCloudPlatform/cloudsql-proxy/tree/main/examples/k8s-health-check#cloud-sql-proxy-health-checks
337347
[iam-auth]: https://cloud.google.com/sql/docs/postgres/authentication
338348
[pkg-badge]: https://pkg.go.dev/badge/github.com/GoogleCloudPlatform/cloudsql-proxy.svg
339349
[pkg-docs]: https://pkg.go.dev/github.com/GoogleCloudPlatform/cloudsql-proxy

cmd/cloud_sql_proxy/cloud_sql_proxy.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"syscall"
3434
"time"
3535

36+
"github.com/GoogleCloudPlatform/cloudsql-proxy/cmd/cloud_sql_proxy/internal/healthcheck"
3637
"github.com/GoogleCloudPlatform/cloudsql-proxy/logging"
3738
"github.com/GoogleCloudPlatform/cloudsql-proxy/proxy/certs"
3839
"github.com/GoogleCloudPlatform/cloudsql-proxy/proxy/fuse"
@@ -131,6 +132,10 @@ unavailable.`,
131132
`When set, the proxy uses this host as the base API path. Example:
132133
https://sqladmin.googleapis.com`,
133134
)
135+
136+
// Settings for healthcheck
137+
useHTTPHealthCheck = flag.Bool("use_http_health_check", false, "When set, creates an HTTP server that checks and communicates the health of the proxy client.")
138+
healthCheckPort = flag.String("health_check_port", "8090", "When applicable, health checks take place on this port number. Defaults to 8090.")
134139
)
135140

136141
const (
@@ -580,6 +585,16 @@ func main() {
580585
RefreshCfgBuffer: refreshCfgBuffer,
581586
}
582587

588+
var hc *healthcheck.Server
589+
if *useHTTPHealthCheck {
590+
hc, err = healthcheck.NewServer(proxyClient, *healthCheckPort)
591+
if err != nil {
592+
logging.Errorf("Could not initialize health check server: %v", err)
593+
os.Exit(1)
594+
}
595+
defer hc.Close(ctx)
596+
}
597+
583598
// Initialize a source of new connections to Cloud SQL instances.
584599
var connSrc <-chan proxy.Conn
585600
if *useFuse {
@@ -619,6 +634,10 @@ func main() {
619634

620635
logging.Infof("Ready for new connections")
621636

637+
if hc != nil {
638+
hc.NotifyStarted()
639+
}
640+
622641
signals := make(chan os.Signal, 1)
623642
signal.Notify(signals, syscall.SIGTERM, syscall.SIGINT)
624643

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
// Copyright 2021 Google LLC All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// Package healthcheck tests and communicates the health of the Cloud SQL Auth proxy.
16+
package healthcheck
17+
18+
import (
19+
"context"
20+
"errors"
21+
"net"
22+
"net/http"
23+
"sync"
24+
25+
"github.com/GoogleCloudPlatform/cloudsql-proxy/logging"
26+
"github.com/GoogleCloudPlatform/cloudsql-proxy/proxy/proxy"
27+
)
28+
29+
// URL paths of the HTTP endpoints registered by NewServer.
const (
	// startupPath is the endpoint that reports whether startup has finished.
	startupPath = "/startup"
	// livenessPath is the endpoint that reports whether the proxy is live.
	livenessPath = "/liveness"
	// readinessPath is the endpoint that reports whether the proxy is ready
	// for new connections.
	readinessPath = "/readiness"
)
34+
35+
// Server is a type used to implement health checks for the proxy.
type Server struct {
	// started indicates whether the proxy has finished starting up. While
	// the channel is open, startup has not finished; once it is closed,
	// startup is complete.
	started chan struct{}
	// once ensures that started can only be closed once.
	once *sync.Once
	// port designates the port number on which Server listens and serves.
	port string
	// srv is the HTTP server used to communicate proxy health.
	srv *http.Server
}
48+
49+
// NewServer initializes a Server and exposes HTTP endpoints used to
50+
// communicate proxy health.
51+
func NewServer(c *proxy.Client, port string) (*Server, error) {
52+
mux := http.NewServeMux()
53+
54+
srv := &http.Server{
55+
Addr: ":" + port,
56+
Handler: mux,
57+
}
58+
59+
hcServer := &Server{
60+
started: make(chan struct{}),
61+
once: &sync.Once{},
62+
port: port,
63+
srv: srv,
64+
}
65+
66+
mux.HandleFunc(startupPath, func(w http.ResponseWriter, _ *http.Request) {
67+
if !hcServer.proxyStarted() {
68+
w.WriteHeader(http.StatusServiceUnavailable)
69+
w.Write([]byte("error"))
70+
return
71+
}
72+
w.WriteHeader(http.StatusOK)
73+
w.Write([]byte("ok"))
74+
})
75+
76+
mux.HandleFunc(readinessPath, func(w http.ResponseWriter, _ *http.Request) {
77+
if !isReady(c, hcServer) {
78+
w.WriteHeader(http.StatusServiceUnavailable)
79+
w.Write([]byte("error"))
80+
return
81+
}
82+
w.WriteHeader(http.StatusOK)
83+
w.Write([]byte("ok"))
84+
})
85+
86+
mux.HandleFunc(livenessPath, func(w http.ResponseWriter, _ *http.Request) {
87+
if !isLive() { // Because isLive() always returns true, this case should not be reached.
88+
w.WriteHeader(http.StatusServiceUnavailable)
89+
w.Write([]byte("error"))
90+
return
91+
}
92+
w.WriteHeader(http.StatusOK)
93+
w.Write([]byte("ok"))
94+
})
95+
96+
ln, err := net.Listen("tcp", srv.Addr)
97+
if err != nil {
98+
return nil, err
99+
}
100+
101+
go func() {
102+
if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
103+
logging.Errorf("Failed to start health check HTTP server: %v", err)
104+
}
105+
}()
106+
107+
return hcServer, nil
108+
}
109+
110+
// Close gracefully shuts down the HTTP server belonging to the Server.
111+
func (s *Server) Close(ctx context.Context) error {
112+
return s.srv.Shutdown(ctx)
113+
}
114+
115+
// NotifyStarted tells the Server that the proxy has finished startup.
116+
func (s *Server) NotifyStarted() {
117+
s.once.Do(func() { close(s.started) })
118+
}
119+
120+
// proxyStarted returns true if started is closed, false otherwise.
121+
func (s *Server) proxyStarted() bool {
122+
select {
123+
case <-s.started:
124+
return true
125+
default:
126+
return false
127+
}
128+
}
129+
130+
// isLive returns true as long as the proxy is running.
//
// It performs no checks of its own: being able to call it at all is taken as
// evidence that the process is alive.
func isLive() bool {
	return true
}
134+
135+
// isReady will check the following criteria before determining whether the
136+
// proxy is ready for new connections.
137+
// 1. Finished starting up / been sent the 'Ready for Connections' log.
138+
// 2. Not yet hit the MaxConnections limit, if applicable.
139+
func isReady(c *proxy.Client, s *Server) bool {
140+
// Not ready until we reach the 'Ready for Connections' log
141+
if !s.proxyStarted() {
142+
logging.Errorf("Readiness failed because proxy has not finished starting up.")
143+
return false
144+
}
145+
146+
// Not ready if the proxy is at the optional MaxConnections limit.
147+
if !c.AvailableConn() {
148+
logging.Errorf("Readiness failed because proxy has reached the maximum connections limit (%d).", c.MaxConnections)
149+
return false
150+
}
151+
152+
return true
153+
}
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
// Copyright 2021 Google LLC All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package healthcheck_test
16+
17+
import (
18+
"context"
19+
"net/http"
20+
"testing"
21+
22+
"github.com/GoogleCloudPlatform/cloudsql-proxy/cmd/cloud_sql_proxy/internal/healthcheck"
23+
"github.com/GoogleCloudPlatform/cloudsql-proxy/proxy/proxy"
24+
)
25+
26+
// Endpoint paths and the port used by the health check tests.
const (
	startupPath   = "/startup"
	livenessPath  = "/liveness"
	readinessPath = "/readiness"
	// testPort is the fixed port on which every test starts its server.
	testPort = "8090"
)
32+
33+
// Test to verify that when the proxy client is up, the liveness endpoint writes http.StatusOK.
34+
func TestLiveness(t *testing.T) {
35+
s, err := healthcheck.NewServer(&proxy.Client{}, testPort)
36+
if err != nil {
37+
t.Fatalf("Could not initialize health check: %v", err)
38+
}
39+
defer s.Close(context.Background())
40+
41+
resp, err := http.Get("http://localhost:" + testPort + livenessPath)
42+
if err != nil {
43+
t.Fatalf("HTTP GET failed: %v", err)
44+
}
45+
if resp.StatusCode != http.StatusOK {
46+
t.Errorf("Got status code %v instead of %v", resp.StatusCode, http.StatusOK)
47+
}
48+
}
49+
50+
// Test to verify that when startup has NOT finished, the startup and readiness endpoints write
51+
// http.StatusServiceUnavailable.
52+
func TestStartupFail(t *testing.T) {
53+
s, err := healthcheck.NewServer(&proxy.Client{}, testPort)
54+
if err != nil {
55+
t.Fatalf("Could not initialize health check: %v\n", err)
56+
}
57+
defer s.Close(context.Background())
58+
59+
resp, err := http.Get("http://localhost:" + testPort + startupPath)
60+
if err != nil {
61+
t.Fatalf("HTTP GET failed: %v\n", err)
62+
}
63+
if resp.StatusCode != http.StatusServiceUnavailable {
64+
t.Errorf("%v returned status code %v instead of %v", startupPath, resp.StatusCode, http.StatusServiceUnavailable)
65+
}
66+
67+
resp, err = http.Get("http://localhost:" + testPort + readinessPath)
68+
if err != nil {
69+
t.Fatalf("HTTP GET failed: %v\n", err)
70+
}
71+
if resp.StatusCode != http.StatusServiceUnavailable {
72+
t.Errorf("%v returned status code %v instead of %v", readinessPath, resp.StatusCode, http.StatusServiceUnavailable)
73+
}
74+
}
75+
76+
// Test to verify that when startup HAS finished (and MaxConnections limit not specified),
77+
// the startup and readiness endpoints write http.StatusOK.
78+
func TestStartupPass(t *testing.T) {
79+
s, err := healthcheck.NewServer(&proxy.Client{}, testPort)
80+
if err != nil {
81+
t.Fatalf("Could not initialize health check: %v\n", err)
82+
}
83+
defer s.Close(context.Background())
84+
85+
// Simulate the proxy client completing startup.
86+
s.NotifyStarted()
87+
88+
resp, err := http.Get("http://localhost:" + testPort + startupPath)
89+
if err != nil {
90+
t.Fatalf("HTTP GET failed: %v\n", err)
91+
}
92+
if resp.StatusCode != http.StatusOK {
93+
t.Errorf("%v returned status code %v instead of %v", startupPath, resp.StatusCode, http.StatusOK)
94+
}
95+
96+
resp, err = http.Get("http://localhost:" + testPort + readinessPath)
97+
if err != nil {
98+
t.Fatalf("HTTP GET failed: %v\n", err)
99+
}
100+
if resp.StatusCode != http.StatusOK {
101+
t.Errorf("%v returned status code %v instead of %v", readinessPath, resp.StatusCode, http.StatusOK)
102+
}
103+
}
104+
105+
// Test to verify that when startup has finished, but MaxConnections has been reached,
106+
// the readiness endpoint writes http.StatusServiceUnavailable.
107+
func TestMaxConnectionsReached(t *testing.T) {
108+
c := &proxy.Client{
109+
MaxConnections: 1,
110+
}
111+
s, err := healthcheck.NewServer(c, testPort)
112+
if err != nil {
113+
t.Fatalf("Could not initialize health check: %v", err)
114+
}
115+
defer s.Close(context.Background())
116+
117+
s.NotifyStarted()
118+
c.ConnectionsCounter = c.MaxConnections // Simulate reaching the limit for maximum number of connections
119+
120+
resp, err := http.Get("http://localhost:" + testPort + readinessPath)
121+
if err != nil {
122+
t.Fatalf("HTTP GET failed: %v", err)
123+
}
124+
if resp.StatusCode != http.StatusServiceUnavailable {
125+
t.Errorf("Got status code %v instead of %v", resp.StatusCode, http.StatusServiceUnavailable)
126+
}
127+
}
128+
129+
// Test to verify that after closing a healthcheck, its liveness endpoint serves
130+
// an error.
131+
func TestCloseHealthCheck(t *testing.T) {
132+
s, err := healthcheck.NewServer(&proxy.Client{}, testPort)
133+
if err != nil {
134+
t.Fatalf("Could not initialize health check: %v", err)
135+
}
136+
defer s.Close(context.Background())
137+
138+
resp, err := http.Get("http://localhost:" + testPort + livenessPath)
139+
if err != nil {
140+
t.Fatalf("HTTP GET failed: %v", err)
141+
}
142+
if resp.StatusCode != http.StatusOK {
143+
t.Errorf("Got status code %v instead of %v", resp.StatusCode, http.StatusOK)
144+
}
145+
146+
err = s.Close(context.Background())
147+
if err != nil {
148+
t.Fatalf("Failed to close health check: %v", err)
149+
}
150+
151+
_, err = http.Get("http://localhost:" + testPort + livenessPath)
152+
if err == nil {
153+
t.Fatalf("HTTP GET did not return error after closing health check server.")
154+
}
155+
}

0 commit comments

Comments
 (0)