Skip to content

Commit 75882ed

Browse files
committed
cmd/runqemubuildlet: restart unresponsive qemu processes
Expose the healthz port from the buildlet running under QEMU, and periodically check it for a successful response. If it has been failing for longer than ten minutes, try to restart the VM. This should successfully restart VMs that failed to boot, failed to shut down, or are otherwise unresponsive. For golang/go#47018 Change-Id: I9218f94ee24de6e0a56ad60a18e075ce48893938 Reviewed-on: https://go-review.googlesource.com/c/build/+/336109 Trust: Alexander Rakoczy <[email protected]> Run-TryBot: Alexander Rakoczy <[email protected]> TryBot-Result: Go Bot <[email protected]> Reviewed-by: Dmitri Shuralyov <[email protected]> Reviewed-by: Carlos Amedee <[email protected]>
1 parent 8cdc394 commit 75882ed

File tree

3 files changed

+186
-11
lines changed

3 files changed

+186
-11
lines changed

cmd/runqemubuildlet/heartbeat.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build go1.16
6+
// +build go1.16
7+
8+
package main
9+
10+
import (
11+
"context"
12+
"fmt"
13+
"io"
14+
"net/http"
15+
"time"
16+
17+
"golang.org/x/build/internal"
18+
)
19+
20+
// buildletHealthTimeout bounds how long a single checkBuildletHealth
// request is allowed to run before it is abandoned.
const buildletHealthTimeout = 10 * time.Second

// checkBuildletHealth sends a GET request to url using
// http.DefaultClient and returns nil only when the response carries
// http.StatusOK. The request runs under a context derived from ctx
// that expires after buildletHealthTimeout.
//
// Errors from building the request, performing it, or reading the
// response body are returned unchanged; any other status code is
// reported as an error naming the received and wanted codes.
func checkBuildletHealth(ctx context.Context, url string) error {
	reqCtx, stop := context.WithTimeout(ctx, buildletHealthTimeout)
	defer stop()

	req, err := http.NewRequestWithContext(reqCtx, http.MethodGet, url, nil)
	if err != nil {
		return err
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// The body is read to completion before the status is inspected, so
	// a read failure takes precedence over an unexpected status code.
	_, err = io.Copy(io.Discard, resp.Body)
	switch {
	case err != nil:
		return err
	case resp.StatusCode != http.StatusOK:
		return fmt.Errorf("resp.StatusCode = %d, wanted %d", resp.StatusCode, http.StatusOK)
	}
	return nil
}
47+
48+
// heartbeatContext calls f every period. If f consistently returns an
49+
// error for longer than the provided timeout duration, the context
50+
// returned by heartbeatContext will be cancelled, and
51+
// heartbeatContext will stop sending requests.
52+
//
53+
// A single call to f that does not return an error will reset the
54+
// timeout window, unless heartbeatContext has already timed out.
55+
func heartbeatContext(ctx context.Context, period time.Duration, timeout time.Duration, f func(context.Context) error) (context.Context, func()) {
56+
ctx, cancel := context.WithCancel(ctx)
57+
58+
lastSuccess := time.Now()
59+
go internal.PeriodicallyDo(ctx, period, func(ctx context.Context, t time.Time) {
60+
err := f(ctx)
61+
if err != nil && t.Sub(lastSuccess) > timeout {
62+
cancel()
63+
}
64+
if err == nil {
65+
lastSuccess = t
66+
}
67+
})
68+
69+
return ctx, cancel
70+
}

cmd/runqemubuildlet/heartbeat_test.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
// Copyright 2021 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build go1.16
6+
// +build go1.16
7+
8+
package main
9+
10+
import (
11+
"context"
12+
"errors"
13+
"fmt"
14+
"net/http"
15+
"net/http/httptest"
16+
"net/url"
17+
"testing"
18+
"time"
19+
)
20+
21+
func TestCheckBuildletHealth(t *testing.T) {
22+
cases := []struct {
23+
desc string
24+
respCode int
25+
wantErr bool
26+
}{
27+
{
28+
desc: "success",
29+
respCode: http.StatusOK,
30+
},
31+
{
32+
desc: "failure",
33+
respCode: http.StatusBadGateway,
34+
wantErr: true,
35+
},
36+
}
37+
for _, c := range cases {
38+
t.Run(c.desc, func(t *testing.T) {
39+
m := http.NewServeMux()
40+
m.HandleFunc("/healthz", func(w http.ResponseWriter, req *http.Request) {
41+
w.WriteHeader(c.respCode)
42+
fmt.Sprintln(w, "ok")
43+
})
44+
s := httptest.NewServer(m)
45+
defer s.Close()
46+
u, err := url.Parse(s.URL)
47+
if err != nil {
48+
t.Fatalf("url.Parse(%q) = %v, wanted no error", s.URL, err)
49+
}
50+
u.Path = "/healthz"
51+
52+
if err := checkBuildletHealth(context.Background(), u.String()); (err != nil) != c.wantErr {
53+
t.Errorf("checkBuildletHealth(_, %q) = %v, wantErr: %t", s.URL, err, c.wantErr)
54+
}
55+
})
56+
}
57+
}
58+
59+
func TestHeartbeatContext(t *testing.T) {
60+
ctx := context.Background()
61+
62+
didWork := make(chan interface{}, 2)
63+
done := make(chan interface{})
64+
ctx, cancel := heartbeatContext(ctx, time.Millisecond, 100*time.Millisecond, func(context.Context) error {
65+
select {
66+
case <-done:
67+
return errors.New("heartbeat stopped")
68+
case didWork <- nil:
69+
default:
70+
}
71+
return nil
72+
})
73+
defer cancel()
74+
75+
select {
76+
case <-time.After(5 * time.Second):
77+
t.Errorf("heatbeatContext() never called f, wanted at least one call")
78+
case <-didWork:
79+
}
80+
81+
select {
82+
case <-done:
83+
t.Errorf("heartbeatContext() finished early, wanted it to still be testing")
84+
case <-didWork:
85+
close(done)
86+
}
87+
88+
select {
89+
case <-time.After(5 * time.Second):
90+
t.Errorf("heartbeatContext() did not timeout, wanted timeout after failing over %v", time.Second)
91+
case <-ctx.Done():
92+
// heartbeatContext() successfully timed out after failing
93+
}
94+
}

cmd/runqemubuildlet/main.go

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
var (
2626
windows10Path = flag.String("windows-10-path", defaultWindowsDir(), "Path to Windows image and QEMU dependencies.")
27+
healthzURL = flag.String("buildlet-healthz-url", "http://localhost:8080/healthz", "URL to buildlet /healthz endpoint.")
2728
)
2829

2930
func main() {
@@ -33,22 +34,32 @@ func main() {
3334
defer stop()
3435

3536
for ctx.Err() == nil {
36-
cmd := windows10Cmd(*windows10Path)
37-
log.Printf("Starting VM: %s", cmd.String())
38-
cmd.Stdout = os.Stdout
39-
cmd.Stderr = os.Stderr
40-
if err := cmd.Start(); err != nil {
41-
log.Printf("cmd.Start() = %v. Retrying in 10 seconds.", err)
37+
if err := runWindows10(ctx); err != nil {
38+
log.Printf("runWindows10() = %v. Retrying in 10 seconds.", err)
4239
time.Sleep(10 * time.Second)
4340
continue
4441
}
45-
if err := internal.WaitOrStop(ctx, cmd, os.Interrupt, time.Minute); err != nil {
46-
log.Printf("waitOrStop(_, %v, %v, %v) = %v. Retrying in 10 seconds.", cmd, os.Interrupt, time.Minute, err)
47-
time.Sleep(10 * time.Second)
48-
}
4942
}
5043
}
5144

45+
func runWindows10(ctx context.Context) error {
46+
cmd := windows10Cmd(*windows10Path)
47+
log.Printf("Starting VM: %s", cmd.String())
48+
cmd.Stdout = os.Stdout
49+
cmd.Stderr = os.Stderr
50+
if err := cmd.Start(); err != nil {
51+
return fmt.Errorf("cmd.Start() = %w", err)
52+
}
53+
ctx, cancel := heartbeatContext(ctx, 30*time.Second, 10*time.Minute, func(ctx context.Context) error {
54+
return checkBuildletHealth(ctx, *healthzURL)
55+
})
56+
defer cancel()
57+
if err := internal.WaitOrStop(ctx, cmd, os.Interrupt, time.Minute); err != nil {
58+
return fmt.Errorf("WaitOrStop(_, %v, %v, %v) = %w", cmd, os.Interrupt, time.Minute, err)
59+
}
60+
return nil
61+
}
62+
5263
// defaultWindowsDir returns a default path for a Windows VM.
5364
//
5465
// The directory should contain the Windows VM image, and UTM
@@ -81,7 +92,7 @@ func windows10Cmd(base string) *exec.Cmd {
8192
"-device", "usb-mouse,bus=usb-bus.0",
8293
"-device", "usb-kbd,bus=usb-bus.0",
8394
"-device", "virtio-net-pci,netdev=net0",
84-
"-netdev", "user,id=net0",
95+
"-netdev", "user,id=net0,hostfwd=tcp::8080-:8080",
8596
"-bios", filepath.Join(base, "Images/QEMU_EFI.fd"),
8697
"-device", "nvme,drive=drive0,serial=drive0,bootindex=0",
8798
"-drive", fmt.Sprintf("if=none,media=disk,id=drive0,file=%s,cache=writethrough", filepath.Join(base, "Images/win10.qcow2")),

0 commit comments

Comments
 (0)