@@ -1800,6 +1800,18 @@ func (st *buildStatus) build() error {
1800
1800
} else if err != nil {
1801
1801
doneMsg = "comm error: " + err .Error ()
1802
1802
}
1803
+ // If a build fails multiple times due to communication
1804
+ // problems with the buildlet, assume something's wrong with
1805
+ // the buildlet or machine and fail the build, rather than
1806
+ // looping forever. This promotes the err (communication
1807
+ // error) to a remoteErr (an error that occurred remotely and
1808
+ // is terminal).
1809
+ if rerr := st .repeatedCommunicationError (err ); rerr != nil {
1810
+ remoteErr = rerr
1811
+ err = nil
1812
+ doneMsg = "communication error to buildlet (promoted to terminal error): " + rerr .Error ()
1813
+ fmt .Fprintf (st , "\n %s\n " , doneMsg )
1814
+ }
1803
1815
if err != nil {
1804
1816
// Return the error *before* we create the magic
1805
1817
// "done" event. (which the try coordinator looks for)
@@ -1950,6 +1962,10 @@ func (st *buildStatus) runAllSharded() (remoteErr, err error) {
1950
1962
remoteErr , err = st .runTests (st .getHelpers ())
1951
1963
}
1952
1964
1965
+ if err == errBuildletsGone {
1966
+ // Don't wrap this error. TODO: use xerrors.
1967
+ return nil , errBuildletsGone
1968
+ }
1953
1969
if err != nil {
1954
1970
return nil , fmt .Errorf ("runTests: %v" , err )
1955
1971
}
@@ -2496,6 +2512,8 @@ func (ts *trySet) affectedPkgs() (pkgs []string) {
2496
2512
return
2497
2513
}
2498
2514
2515
+ var errBuildletsGone = errors .New ("runTests: dist test failed: all buildlets had network errors or timeouts, yet tests remain" )
2516
+
2499
2517
// runTests is only called for builders which support a split make/run
2500
2518
// (should be everything, at least soon). Currently (2015-05-27) iOS
2501
2519
// and Android and Nacl do not.
@@ -2630,7 +2648,7 @@ func (st *buildStatus) runTests(helpers <-chan *buildlet.Client) (remoteErr, err
2630
2648
st .LogEventTime ("still_waiting_on_test" , ti .name )
2631
2649
case <- buildletsGone :
2632
2650
set .cancelAll ()
2633
- return nil , fmt . Errorf ( "dist test failed: all buildlets had network errors or timeouts, yet tests remain" )
2651
+ return nil , errBuildletsGone
2634
2652
}
2635
2653
}
2636
2654
@@ -3289,6 +3307,26 @@ func (st *buildStatus) Write(p []byte) (n int, err error) {
3289
3307
return st .output .Write (p )
3290
3308
}
3291
3309
3310
+ // repeatedCommunicationError takes a buildlet execution error (a
3311
+ // network/communication error, as opposed to a remote execution that
3312
+ // ran and had a non-zero exit status and we heard about) and
3313
+ // conditionally promotes it to a terminal error. If this returns a
3314
+ // non-nil value, the execErr should be considered terminal with the
3315
+ // returned error.
3316
+ func (st * buildStatus ) repeatedCommunicationError (execErr error ) error {
3317
+ if execErr == nil {
3318
+ return nil
3319
+ }
3320
+ // For now, only do this for plan9, which is flaky (Issue 31261)
3321
+ if strings .HasPrefix (st .Name , "plan9-" ) && execErr == errBuildletsGone {
3322
+ // TODO: give it two tries at least later (store state
3323
+ // somewhere; global map?). But for now we're going to
3324
+ // only give it one try.
3325
+ return fmt .Errorf ("network error promoted to terminal error: %v" , execErr )
3326
+ }
3327
+ return nil
3328
+ }
3329
+
3292
3330
func useGitMirror () bool {
3293
3331
return * mode != "dev"
3294
3332
}
0 commit comments