cmd/go: use Cancel and WaitDelay to terminate test subprocesses

Bryan C. Mills · gopherbot · commit caf8fb1e828d · 2023-01-19T20:45:56.000Z
Updates #50436. Updates #56163. Fixes #24050. Change-Id: I1b00eb8fb60e0879f029642b5bad97b2e139fee6 Reviewed-on: https://go-review.googlesource.com/c/go/+/456116 Reviewed-by: Ian Lance Taylor <iant@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Run-TryBot: Bryan Mills <bcmills@google.com> Auto-Submit: Bryan Mills <bcmills@google.com>
diff --git a/src/cmd/go/internal/test/test.go b/src/cmd/go/internal/test/test.go
@@ -1276,7 +1276,12 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
 		}
 	}
 
-	cmd := exec.Command(args[0], args[1:]...)
+	// Normally, the test will terminate itself when the timeout expires,
+	// but add a last-ditch deadline to detect and stop wedged binaries.
+	ctx, cancel := context.WithTimeout(ctx, testKillTimeout)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, args[0], args[1:]...)
 	cmd.Dir = a.Package.Dir
 
 	env := cfg.OrigEnv[:len(cfg.OrigEnv):len(cfg.OrigEnv)]
@@ -1309,42 +1314,33 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
 		cmd.Env = env
 	}
 
-	base.StartSigHandlers()
-	t0 := time.Now()
-	err = cmd.Start()
-
-	// This is a last-ditch deadline to detect and
-	// stop wedged test binaries, to keep the builders
-	// running.
-	if err == nil {
-		tick := time.NewTimer(testKillTimeout)
-		done := make(chan error)
-		go func() {
-			done <- cmd.Wait()
-		}()
-	Outer:
-		select {
-		case err = <-done:
-			// ok
-		case <-tick.C:
-			if base.SignalTrace != nil {
-				// Send a quit signal in the hope that the program will print
-				// a stack trace and exit. Give it five seconds before resorting
-				// to Kill.
-				cmd.Process.Signal(base.SignalTrace)
-				select {
-				case err = <-done:
-					fmt.Fprintf(cmd.Stdout, "*** Test killed with %v: ran too long (%v).\n", base.SignalTrace, testKillTimeout)
-					break Outer
-				case <-time.After(5 * time.Second):
-				}
+	var (
+		cancelKilled   = false
+		cancelSignaled = false
+	)
+	cmd.Cancel = func() error {
+		if base.SignalTrace == nil {
+			err := cmd.Process.Kill()
+			if err == nil {
+				cancelKilled = true
 			}
-			cmd.Process.Kill()
-			err = <-done
-			fmt.Fprintf(cmd.Stdout, "*** Test killed: ran too long (%v).\n", testKillTimeout)
+			return err
 		}
-		tick.Stop()
+
+		// Send a quit signal in the hope that the program will print
+		// a stack trace and exit.
+		err := cmd.Process.Signal(base.SignalTrace)
+		if err == nil {
+			cancelSignaled = true
+		}
+		return err
 	}
+	// Give the test five seconds to exit after the signal before resorting to Kill.
+	cmd.WaitDelay = 5 * time.Second
+
+	base.StartSigHandlers()
+	t0 := time.Now()
+	err = cmd.Run()
 	out := buf.Bytes()
 	a.TestOutput = &buf
 	t := fmt.Sprintf("%.3fs", time.Since(t0).Seconds())
@@ -1374,7 +1370,13 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
 		r.c.saveOutput(a)
 	} else {
 		base.SetExitStatus(1)
-		if len(out) == 0 {
+		if cancelSignaled {
+			fmt.Fprintf(cmd.Stdout, "*** Test killed with %v: ran too long (%v).\n", base.SignalTrace, testKillTimeout)
+		} else if cancelKilled {
+			fmt.Fprintf(cmd.Stdout, "*** Test killed: ran too long (%v).\n", testKillTimeout)
+		}
+		var ee *exec.ExitError
+		if len(out) == 0 || !errors.As(err, &ee) || !ee.Exited() {
 			// If there was no test output, print the exit status so that the reason
 			// for failure is clear.
 			fmt.Fprintf(cmd.Stdout, "%s\n", err)
diff --git a/src/cmd/go/testdata/script/test_timeout_stdin.txt b/src/cmd/go/testdata/script/test_timeout_stdin.txt
@@ -0,0 +1,88 @@
+# Regression test for https://go.dev/issue/24050:
+# a test that exits with an I/O stream held open
+# should fail after a reasonable delay, not wait forever.
+# (As of the time of writing, that delay is 5 seconds,
+# but this test does not depend on its specific value.)
+
+[short] skip 'runs a test that hangs until its WaitDelay expires'
+
+! go test -v -timeout=1m .
+
+	# After the test process itself prints PASS and exits,
+	# the kernel closes its stdin pipe to to the orphaned subprocess.
+	# At that point, we expect the subprocess to print 'stdin closed'
+	# and periodically log to stderr until the WaitDelay expires.
+	#
+	# Once the WaitDelay expires, the copying goroutine for 'go test' stops and
+	# closes the read side of the stderr pipe, and the subprocess will eventually
+	# exit due to a failed write to that pipe.
+
+stdout '^--- PASS: TestOrphanCmd .*\nPASS\nstdin closed'
+stdout '^exec: WaitDelay expired before I/O complete\nFAIL\s+example\s+\d+(\.\d+)?s'
+
+-- go.mod --
+module example
+
+go 1.20
+-- main_test.go --
+package main
+
+import (
+	"fmt"
+	"io"
+	"os"
+	"os/exec"
+	"testing"
+	"time"
+)
+
+func TestMain(m *testing.M) {
+	if os.Getenv("TEST_TIMEOUT_HANG") == "1" {
+		io.Copy(io.Discard, os.Stdin)
+		if _, err := os.Stderr.WriteString("stdin closed\n"); err != nil {
+			os.Exit(1)
+		}
+
+		ticker := time.NewTicker(100 * time.Millisecond)
+		for t := range ticker.C {
+			_, err := fmt.Fprintf(os.Stderr, "still alive at %v\n", t)
+			if err != nil {
+				os.Exit(1)
+			}
+		}
+	}
+
+	m.Run()
+}
+
+func TestOrphanCmd(t *testing.T) {
+	exe, err := os.Executable()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	cmd := exec.Command(exe)
+	cmd.Env = append(cmd.Environ(), "TEST_TIMEOUT_HANG=1")
+
+	// Hold stdin open until this (parent) process exits.
+	if _, err := cmd.StdinPipe(); err != nil {
+		t.Fatal(err)
+	}
+
+	// Forward stderr to the subprocess so that it can hold the stream open.
+	cmd.Stderr = os.Stderr
+
+	if err := cmd.Start(); err != nil {
+		t.Fatal(err)
+	}
+	t.Logf("started %v", cmd)
+
+	// Intentionally leak cmd when the test completes.
+	// This will allow the test process itself to exit, but (at least on Unix
+	// platforms) will keep the parent process's stderr stream open.
+	go func() {
+		if err := cmd.Wait(); err != nil {
+			os.Exit(3)
+		}
+	}()
+}