stablekernel · joshua-temple · Jun 17, 2026 · Jun 17, 2026
diff --git a/e2e/harness/act.go b/e2e/harness/act.go
@@ -8,11 +8,13 @@ import (
 	"fmt"
 	"io"
 	"strings"
+	"time"
 
 	"github.com/moby/moby/api/pkg/stdcopy"
 	"github.com/moby/moby/api/types/container"
 	"github.com/moby/moby/api/types/mount"
 	"github.com/testcontainers/testcontainers-go"
+	tcexec "github.com/testcontainers/testcontainers-go/exec"
 	"github.com/testcontainers/testcontainers-go/wait"
 )
 
@@ -69,14 +71,12 @@ func NewActRunner(ctx context.Context, giteaURL, giteaToken, networkName string,
 		return nil, fmt.Errorf("failed to start act container: %w", err)
 	}
 
-	// Install act in the container
-	_, _, err = container.Exec(ctx, []string{
-		"bash", "-c",
-		"curl -s https://raw.githubusercontent.com/nektos/act/master/install.sh | bash -s -- -b /usr/local/bin",
-	})
-	if err != nil {
+	// Install act in the container, verifying the binary actually lands and
+	// retrying the transient network hiccups that the install script is prone
+	// to. The real *DockerContainer satisfies containerExecer directly.
+	if err := installActWithVerify(ctx, container, actInstallMaxAttempts); err != nil {
 		_ = container.Terminate(ctx) // Best-effort cleanup
-		return nil, fmt.Errorf("failed to install act: %w", err)
+		return nil, err
 	}
 
 	// Network override is passed on the CLI as `--network=<name>`. Act's
@@ -112,6 +112,94 @@ EOF`
 	}, nil
 }
 
+// containerExecer is the slice of the container surface the act install needs:
+// running a command and getting back its exit code plus an output reader. The
+// real *DockerContainer satisfies it, and a fake satisfies it in tests, so the
+// install/verify/retry logic is exercisable without Docker or the network.
+type containerExecer interface {
+	Exec(ctx context.Context, cmd []string, options ...tcexec.ProcessOption) (int, io.Reader, error)
+}
+
+// actInstallMaxAttempts bounds how many times we try the act install before
+// giving up. The install failure mode is a transient network hiccup (registry
+// rate-limit, a dropped connection while fetching the install script), which a
+// clean retry reliably clears, so a small fixed budget is enough.
+const actInstallMaxAttempts = 3
+
+// actInstallBackoff returns the pause before the retry that follows a failed
+// install attempt (attempt is 1-based). It grows 2s then 4s so a momentary
+// network blip has time to clear without stalling the suite. It is a var so
+// tests can zero the wait and stay fast.
+var actInstallBackoff = func(attempt int) time.Duration {
+	return time.Duration(attempt*2) * time.Second
+}
+
+// installActWithVerify installs the act binary into the container and confirms
+// it is actually runnable, retrying transient failures up to maxAttempts.
+//
+// This exists because container Exec returns a non-nil error ONLY for a Docker
+// API/transport failure, NOT when the executed command exits non-zero. The
+// original install discarded the exit code, so a failed `curl | bash` (a
+// registry rate-limit or a dropped connection) looked like success: the runner
+// came back "healthy" and the first workflow run died with
+// `bash: act: command not found`. Here we check the exit code, then prove the
+// binary resolves with `command -v act`, and only trust an attempt that clears
+// both gates. A bounded retry with backoff absorbs the transient hiccup that is
+// the usual cause; the final failure surfaces the failing command output so the
+// real reason is not lost.
+func installActWithVerify(ctx context.Context, exec containerExecer, maxAttempts int) error {
+	const installScript = "curl -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | bash -s -- -b /usr/local/bin"
+
+	var lastErr error
+	for attempt := 1; attempt <= maxAttempts; attempt++ {
+		if attempt > 1 {
+			select {
+			case <-ctx.Done():
+				return fmt.Errorf("act install cancelled after %d attempt(s): %w", attempt-1, ctx.Err())
+			case <-time.After(actInstallBackoff(attempt - 1)):
+			}
+		}
+
+		installCode, installReader, err := exec.Exec(ctx, []string{"bash", "-c", installScript})
+		if err != nil {
+			lastErr = fmt.Errorf("act install exec failed: %w", err)
+			continue
+		}
+		if installCode != 0 {
+			lastErr = fmt.Errorf("act install exited with code %d: %s", installCode, readExecOutput(installReader))
+			continue
+		}
+
+		verifyCode, verifyReader, err := exec.Exec(ctx, []string{"bash", "-c", "command -v act"})
+		if err != nil {
+			lastErr = fmt.Errorf("act verify exec failed: %w", err)
+			continue
+		}
+		if verifyCode != 0 {
+			lastErr = fmt.Errorf("act binary not resolvable after install (command -v act exited %d): %s", verifyCode, readExecOutput(verifyReader))
+			continue
+		}
+
+		return nil
+	}
+
+	return fmt.Errorf("failed to install act after %d attempt(s): %w", maxAttempts, lastErr)
+}
+
+// readExecOutput drains an exec output reader into a trimmed string for error
+// messages, tolerating a nil reader. Output here is short (a curl error line or
+// a `command -v` result), so reading it whole is fine.
+func readExecOutput(reader io.Reader) string {
+	if reader == nil {
+		return ""
+	}
+	out, err := io.ReadAll(reader)
+	if err != nil {
+		return fmt.Sprintf("(failed to read command output: %v)", err)
+	}
+	return strings.TrimSpace(string(out))
+}
+
 // Container returns the underlying testcontainers.Container
 func (a *ActRunner) Container() testcontainers.Container {
 	return a.container

diff --git a/e2e/harness/act_test.go b/e2e/harness/act_test.go
@@ -4,13 +4,15 @@ import (
 	"bytes"
 	"context"
 	"encoding/binary"
+	"io"
 	"strings"
 	"testing"
 	"time"
 
 	"github.com/moby/moby/api/pkg/stdcopy"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	tcexec "github.com/testcontainers/testcontainers-go/exec"
 )
 
 // frameDockerStream builds a single Docker hijacked-attach frame: an 8-byte
@@ -399,3 +401,118 @@ func TestNormalizeWorkflowResult(t *testing.T) {
 		})
 	}
 }
+
+// scriptedExec is a per-call response the fake execer hands back: an exit code
+// and the text the returned reader yields. The install path keys its decisions
+// off both, so the fake must control them independently.
+type scriptedExec struct {
+	exitCode int
+	output   string
+}
+
+// fakeExecer is a containerExecer that scripts responses without Docker. It
+// classifies each call as an install (the curl|bash command) or a verify
+// (`command -v act`) by inspecting the command string, and pops the next
+// scripted response from the matching queue. This lets the tests drive the
+// transient-failure and binary-missing paths deterministically.
+type fakeExecer struct {
+	installResponses []scriptedExec
+	verifyResponses  []scriptedExec
+	installCalls     int
+	verifyCalls      int
+}
+
+func (f *fakeExecer) Exec(_ context.Context, cmd []string, _ ...tcexec.ProcessOption) (int, io.Reader, error) {
+	joined := strings.Join(cmd, " ")
+	switch {
+	case strings.Contains(joined, "command -v act"):
+		resp := f.verifyResponses[f.verifyCalls]
+		f.verifyCalls++
+		return resp.exitCode, strings.NewReader(resp.output), nil
+	default:
+		resp := f.installResponses[f.installCalls]
+		f.installCalls++
+		return resp.exitCode, strings.NewReader(resp.output), nil
+	}
+}
+
+// noInstallBackoff zeroes the install retry pause for the duration of a test so
+// the retry path runs instantly. These install tests therefore do not call
+// t.Parallel: they share the package-level backoff var.
+func noInstallBackoff(t *testing.T) {
+	t.Helper()
+	prev := actInstallBackoff
+	actInstallBackoff = func(int) time.Duration { return 0 }
+	t.Cleanup(func() { actInstallBackoff = prev })
+}
+
+// TestInstallActWithVerify_TransientInstallFailureThenSuccess proves the install
+// path survives a transient non-zero curl|bash exit: the first attempt fails
+// (the silent-non-zero-exit footgun this code closes), and a clean retry
+// installs and verifies act. Before the fix the discarded exit code let a failed
+// install masquerade as success, so the first real workflow run died with
+// `bash: act: command not found`.
+func TestInstallActWithVerify_TransientInstallFailureThenSuccess(t *testing.T) {
+	noInstallBackoff(t)
+
+	fake := &fakeExecer{
+		installResponses: []scriptedExec{
+			{exitCode: 1, output: "curl: (28) timed out"},
+			{exitCode: 0, output: ""},
+		},
+		verifyResponses: []scriptedExec{
+			{exitCode: 0, output: "/usr/local/bin/act"},
+		},
+	}
+
+	err := installActWithVerify(context.Background(), fake, 3)
+	require.NoError(t, err)
+	assert.Equal(t, 2, fake.installCalls, "install should be retried once after the transient failure")
+	assert.Equal(t, 1, fake.verifyCalls, "verify runs once, after the successful install")
+}
+
+// TestInstallActWithVerify_BinaryMissingThenRecovers proves the install path
+// does not trust a zero-exit install alone. The first attempt's install exits 0
+// but `command -v act` cannot resolve the binary (a partial or interrupted
+// install), so the attempt is rejected and retried; the second attempt both
+// installs and verifies cleanly.
+func TestInstallActWithVerify_BinaryMissingThenRecovers(t *testing.T) {
+	noInstallBackoff(t)
+
+	fake := &fakeExecer{
+		installResponses: []scriptedExec{
+			{exitCode: 0, output: ""},
+			{exitCode: 0, output: ""},
+		},
+		verifyResponses: []scriptedExec{
+			{exitCode: 1, output: ""},
+			{exitCode: 0, output: "/usr/local/bin/act"},
+		},
+	}
+
+	err := installActWithVerify(context.Background(), fake, 3)
+	require.NoError(t, err)
+	assert.Equal(t, 2, fake.installCalls, "install should be retried after the binary check failed")
+	assert.Equal(t, 2, fake.verifyCalls, "verify runs once per attempt")
+}
+
+// TestInstallActWithVerify_AllAttemptsFail proves the bounded retry gives up and
+// returns an error (rather than a falsely healthy runner) when every attempt
+// fails, and that the failing command output is surfaced for debugging.
+func TestInstallActWithVerify_AllAttemptsFail(t *testing.T) {
+	noInstallBackoff(t)
+
+	fake := &fakeExecer{
+		installResponses: []scriptedExec{
+			{exitCode: 1, output: "curl: (6) could not resolve host"},
+			{exitCode: 1, output: "curl: (6) could not resolve host"},
+			{exitCode: 1, output: "curl: (6) could not resolve host"},
+		},
+		verifyResponses: []scriptedExec{},
+	}
+
+	err := installActWithVerify(context.Background(), fake, 3)
+	require.Error(t, err)
+	assert.Equal(t, 3, fake.installCalls, "all attempts should be exhausted")
+	assert.Contains(t, err.Error(), "could not resolve host", "failing command output should be surfaced")
+}