Skip to content

Commit 9eec085

Browse files
committed
cmd/coordinator: don't let plan9 buildlets retry forever on heartbeat timeout
Quick fix to stop wasting resources for now. Updates golang/go#31261 Change-Id: Ia95988197313e5a8c7db3d8c557a8c7dd24b93ef Reviewed-on: https://go-review.googlesource.com/c/build/+/170781 Reviewed-by: Dmitri Shuralyov <[email protected]>
1 parent 4e29d67 commit 9eec085

File tree

1 file changed

+39
-1
lines changed

1 file changed

+39
-1
lines changed

cmd/coordinator/coordinator.go

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1800,6 +1800,18 @@ func (st *buildStatus) build() error {
18001800
} else if err != nil {
18011801
doneMsg = "comm error: " + err.Error()
18021802
}
1803+
// If a build fails multiple times due to communication
1804+
// problems with the buildlet, assume something's wrong with
1805+
// the buildlet or machine and fail the build, rather than
1806+
// looping forever. This promotes the err (communication
1807+
// error) to a remoteErr (an error that occurred remotely and
1808+
// is terminal).
1809+
if rerr := st.repeatedCommunicationError(err); rerr != nil {
1810+
remoteErr = rerr
1811+
err = nil
1812+
doneMsg = "communication error to buildlet (promoted to terminal error): " + rerr.Error()
1813+
fmt.Fprintf(st, "\n%s\n", doneMsg)
1814+
}
18031815
if err != nil {
18041816
// Return the error *before* we create the magic
18051817
// "done" event. (which the try coordinator looks for)
@@ -1950,6 +1962,10 @@ func (st *buildStatus) runAllSharded() (remoteErr, err error) {
19501962
remoteErr, err = st.runTests(st.getHelpers())
19511963
}
19521964

1965+
if err == errBuildletsGone {
1966+
// Don't wrap this error. TODO: use xerrors.
1967+
return nil, errBuildletsGone
1968+
}
19531969
if err != nil {
19541970
return nil, fmt.Errorf("runTests: %v", err)
19551971
}
@@ -2496,6 +2512,8 @@ func (ts *trySet) affectedPkgs() (pkgs []string) {
24962512
return
24972513
}
24982514

2515+
var errBuildletsGone = errors.New("runTests: dist test failed: all buildlets had network errors or timeouts, yet tests remain")
2516+
24992517
// runTests is only called for builders which support a split make/run
25002518
// (should be everything, at least soon). Currently (2015-05-27) iOS
25012519
// and Android and Nacl do not.
@@ -2630,7 +2648,7 @@ func (st *buildStatus) runTests(helpers <-chan *buildlet.Client) (remoteErr, err
26302648
st.LogEventTime("still_waiting_on_test", ti.name)
26312649
case <-buildletsGone:
26322650
set.cancelAll()
2633-
return nil, fmt.Errorf("dist test failed: all buildlets had network errors or timeouts, yet tests remain")
2651+
return nil, errBuildletsGone
26342652
}
26352653
}
26362654

@@ -3289,6 +3307,26 @@ func (st *buildStatus) Write(p []byte) (n int, err error) {
32893307
return st.output.Write(p)
32903308
}
32913309

3310+
// repeatedCommunicationError takes a buildlet execution error (a
3311+
// network/communication error, as opposed to a remote execution that
3312+
// ran and had a non-zero exit status and we heard about) and
3313+
// conditionally promotes it to a terminal error. If this returns a
3314+
// non-nil value, the execErr should be considered terminal with the
3315+
// returned error.
3316+
func (st *buildStatus) repeatedCommunicationError(execErr error) error {
3317+
if execErr == nil {
3318+
return nil
3319+
}
3320+
// For now, only do this for plan9, which is flaky (Issue 31261)
3321+
if strings.HasPrefix(st.Name, "plan9-") && execErr == errBuildletsGone {
3322+
// TODO: give it two tries at least later (store state
3323+
// somewhere; global map?). But for now we're going to
3324+
// only give it one try.
3325+
return fmt.Errorf("network error promoted to terminal error: %v", execErr)
3326+
}
3327+
return nil
3328+
}
3329+
32923330
func useGitMirror() bool {
32933331
return *mode != "dev"
32943332
}

0 commit comments

Comments
 (0)