Refactor handler lifecycle management with timeouts and auto-recovery (#9)

stringintech · web-flow · commit e3408471c8aa · 2025-12-02T22:46:17.000+03:30
Refactors handler process management to prevent test suite hangs and enable reliable execution:

- Add `--handler-timeout` flag for max response wait per test case
- Add `--timeout` flag for total execution time limit
- Auto-respawn crashed/broken handlers to allow remaining tests to continue
- Clean handler shutdown with force-kill fallback
- Introduce re-exec pattern for unit testing handler behaviors
diff --git a/Makefile b/Makefile
@@ -20,7 +20,7 @@ mock-handler:
 
 test:
 	@echo "Running runner unit tests..."
-	go test ./runner/...
+	go test -v ./runner/...
 	@echo "Running conformance tests with mock handler..."
 	$(RUNNER_BIN) -handler $(MOCK_HANDLER_BIN)
 
diff --git a/README.md b/README.md
@@ -52,8 +52,20 @@ make runner
 
 # Run the test runner against your handler binary
 ./build/runner --handler <path-to-your-handler>
+
+# Configure timeouts (optional):
+#   --handler-timeout  max wait per test case (default: 10s)
+#   --timeout          total execution limit (default: 30s)
+./build/runner --handler <path-to-your-handler> --handler-timeout 30s --timeout 2m
 ```
 
+#### Timeout Flags
+
+- **`--handler-timeout`** (default: 10s): Maximum time to wait for handler response to each test case. Prevents hangs on unresponsive handlers.
+- **`--timeout`** (default: 30s): Total execution time limit across all test suites. Ensures bounded test runs.
+
+The runner automatically detects and recovers from crashed/unresponsive handlers, allowing remaining tests to continue.
+
 ### Testing the Runner
 
 Build and test the runner:
diff --git a/cmd/runner/main.go b/cmd/runner/main.go
@@ -1,18 +1,22 @@
 package main
 
 import (
+	"context"
 	"flag"
 	"fmt"
 	"io/fs"
 	"os"
 	"strings"
+	"time"
 
 	"github.com/stringintech/kernel-bindings-tests/runner"
 	"github.com/stringintech/kernel-bindings-tests/testdata"
 )
 
 func main() {
 	handlerPath := flag.String("handler", "", "Path to handler binary")
+	handlerTimeout := flag.Duration("handler-timeout", 10*time.Second, "Max time to wait for handler to respond to each test case (e.g., 10s, 500ms)")
+	timeout := flag.Duration("timeout", 30*time.Second, "Total timeout for executing all test suites (e.g., 30s, 1m)")
 	flag.Parse()
 
 	if *handlerPath == "" {
@@ -33,6 +37,18 @@ func main() {
 		os.Exit(1)
 	}
 
+	// Create test runner
+	testRunner, err := runner.NewTestRunner(*handlerPath, *handlerTimeout, *timeout)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Error creating test runner: %v\n", err)
+		os.Exit(1)
+	}
+	defer testRunner.CloseHandler()
+
+	// Create context with total execution timeout
+	ctx, cancel := context.WithTimeout(context.Background(), *timeout)
+	defer cancel()
+
 	// Run tests
 	totalPassed := 0
 	totalFailed := 0
@@ -48,17 +64,8 @@ func main() {
 			continue
 		}
 
-		// Create test runner
-		testRunner, err := runner.NewTestRunner(*handlerPath)
-		if err != nil {
-			fmt.Fprintf(os.Stderr, "Error creating test runner: %v\n", err)
-			continue
-		}
-
 		// Run suite
-		result := testRunner.RunTestSuite(*suite)
-		testRunner.Close()
-
+		result := testRunner.RunTestSuite(ctx, *suite)
 		printResults(suite, result)
 
 		totalPassed += result.PassedTests
diff --git a/runner/handler.go b/runner/handler.go
@@ -0,0 +1,158 @@
+package runner
+
+import (
+	"bufio"
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"os/exec"
+	"time"
+)
+
+var (
+	// ErrHandlerTimeout is reported by ReadLine (possibly wrapped with stderr output) when no line arrives within the timeout.
+	ErrHandlerTimeout = errors.New("handler timeout")
+	// ErrHandlerClosed is reported by ReadLine (possibly wrapped with stderr output) when stdout hits EOF before a line is read.
+	ErrHandlerClosed = errors.New("handler closed unexpectedly")
+)
+
+// HandlerConfig describes how to launch a handler process and how long to wait for its output.
+type HandlerConfig struct {
+	Path string   // Executable to run.
+	Args []string // Arguments passed to the executable.
+	Env  []string // Extra entries appended to the environment the command would otherwise use; nil adds none.
+	// Timeout specifies the maximum duration to wait when reading from the handler's
+	// stdout. If zero, defaults to 10 seconds. The handler is killed if it fails to
+	// write output within this timeout.
+	Timeout time.Duration
+}
+
+// Handler manages a conformance handler process communicating via stdin/stdout.
+type Handler struct {
+	cmd     *exec.Cmd      // The started handler process.
+	stdin   io.WriteCloser // Pipe to the handler's stdin; SendLine writes to it.
+	stdout  *bufio.Scanner // Line scanner over the handler's stdout pipe; ReadLine reads from it.
+	stderr  io.ReadCloser  // Handler's stderr pipe; drained by ReadLine only after EOF or timeout.
+	timeout time.Duration  // Maximum time a single ReadLine call waits for a line.
+}
+
+// NewHandler starts the process described by cfg with piped stdin, stdout and stderr; a zero cfg.Timeout becomes 10 seconds.
+func NewHandler(cfg *HandlerConfig) (*Handler, error) {
+	cmd := exec.Command(cfg.Path, cfg.Args...)
+	if cfg.Env != nil {
+		cmd.Env = append(cmd.Environ(), cfg.Env...)
+	}
+
+	stdin, err := cmd.StdinPipe()
+	if err != nil {
+		return nil, fmt.Errorf("failed to create stdin pipe: %w", err)
+	}
+
+	stdout, err := cmd.StdoutPipe()
+	if err != nil {
+		return nil, fmt.Errorf("failed to create stdout pipe: %w", err)
+	}
+
+	stderr, err := cmd.StderrPipe()
+	if err != nil {
+		return nil, fmt.Errorf("failed to create stderr pipe: %w", err)
+	}
+
+	// Start() automatically closes all pipes on failure, no manual cleanup needed
+	if err := cmd.Start(); err != nil {
+		return nil, fmt.Errorf("failed to start handler: %w", err)
+	}
+
+	timeout := cfg.Timeout
+	if timeout == 0 {
+		timeout = 10 * time.Second
+	}
+
+	return &Handler{
+		cmd:     cmd,
+		stdin:   stdin,
+		stdout:  bufio.NewScanner(stdout),
+		stderr:  stderr,
+		timeout: timeout,
+	}, nil
+}
+
+// SendLine writes line plus a trailing newline to the handler's stdin; the capped slice forces append to copy, leaving the caller's buffer untouched.
+func (h *Handler) SendLine(line []byte) error {
+	_, err := h.stdin.Write(append(line[:len(line):len(line)], '\n'))
+	return err
+}
+
+// ReadLine returns the next stdout line; on EOF or after h.timeout it kills the handler and returns ErrHandlerClosed or ErrHandlerTimeout, wrapped with stderr output if any.
+func (h *Handler) ReadLine() ([]byte, error) {
+	// Use a timeout for Scan() in case the handler hangs
+	scanDone := make(chan bool, 1)
+	go func() {
+		scanDone <- h.stdout.Scan()
+	}()
+
+	var baseErr error
+	select {
+	case ok := <-scanDone:
+		if ok {
+			return h.stdout.Bytes(), nil // scanner-owned slice: may be overwritten by the next Scan
+		}
+		if err := h.stdout.Err(); err != nil {
+			return nil, err
+		}
+		// EOF - handler closed stdout prematurely, fall through to kill and capture stderr
+		baseErr = ErrHandlerClosed
+	case <-time.After(h.timeout):
+		// Timeout - handler didn't respond, fall through to kill and capture stderr
+		baseErr = ErrHandlerTimeout
+	}
+
+	// Kill the process immediately to force stderr to close.
+	// Without this, there's a rare scenario where stdout closes but stderr remains open,
+	// causing io.ReadAll(h.stderr) below to block indefinitely waiting for stderr EOF.
+	if h.cmd.Process != nil {
+		h.cmd.Process.Kill()
+	}
+
+	// Capture stderr to provide diagnostic information when the handler fails.
+	if stderrOut, err := io.ReadAll(h.stderr); err == nil && len(stderrOut) > 0 {
+		return nil, fmt.Errorf("%w: %s", baseErr, bytes.TrimSpace(stderrOut))
+	}
+	return nil, baseErr
+}
+
+// Close closes stdin and waits for the handler to exit with a 5-second timeout.
+// If the handler doesn't exit within the timeout, it is killed.
+func (h *Handler) Close() {
+	if h.stdin != nil {
+		// Close stdin to signal the handler that we're done sending requests.
+		// Per the handler specification, the handler should exit cleanly when stdin closes.
+		h.stdin.Close()
+	}
+	if h.cmd != nil {
+		// Wait for the handler to exit cleanly in response to stdin closing.
+		// Wait() automatically closes all remaining pipes after the process exits.
+		// Use a timeout in case the handler doesn't respect the protocol.
+		done := make(chan error, 1)
+		go func() {
+			done <- h.cmd.Wait()
+		}()
+
+		select {
+		case err := <-done:
+			if err != nil {
+				slog.Warn("Handler exit with error", "error", err)
+			}
+		case <-time.After(5 * time.Second):
+			slog.Warn("Handler did not exit within a 5-second timeout, killing process")
+			if h.cmd.Process != nil {
+				h.cmd.Process.Kill()
+				// Let the Wait() already running in the goroutine above reap the process and
+				// close the pipes; calling Wait() a second time would race with that call.
+				<-done
+			}
+		}
+	}
+}
diff --git a/runner/handler_test.go b/runner/handler_test.go
diff --git a/runner/runner.go b/runner/runner.go