diff --git a/e2e/harness/act.go b/e2e/harness/act.go index c76e3db..b69af48 100644 --- a/e2e/harness/act.go +++ b/e2e/harness/act.go @@ -8,11 +8,13 @@ import ( "fmt" "io" "strings" + "time" "github.com/moby/moby/api/pkg/stdcopy" "github.com/moby/moby/api/types/container" "github.com/moby/moby/api/types/mount" "github.com/testcontainers/testcontainers-go" + tcexec "github.com/testcontainers/testcontainers-go/exec" "github.com/testcontainers/testcontainers-go/wait" ) @@ -69,14 +71,12 @@ func NewActRunner(ctx context.Context, giteaURL, giteaToken, networkName string, return nil, fmt.Errorf("failed to start act container: %w", err) } - // Install act in the container - _, _, err = container.Exec(ctx, []string{ - "bash", "-c", - "curl -s https://raw.githubusercontent.com/nektos/act/master/install.sh | bash -s -- -b /usr/local/bin", - }) - if err != nil { + // Install act in the container, verifying the binary actually lands and + // retrying the transient network hiccups that the install script is prone + // to. The real *DockerContainer satisfies containerExecer directly. + if err := installActWithVerify(ctx, container, actInstallMaxAttempts); err != nil { _ = container.Terminate(ctx) // Best-effort cleanup - return nil, fmt.Errorf("failed to install act: %w", err) + return nil, err } // Network override is passed on the CLI as `--network=`. Act's @@ -112,6 +112,94 @@ EOF` }, nil } +// containerExecer is the slice of the container surface the act install needs: +// running a command and getting back its exit code plus an output reader. The +// real *DockerContainer satisfies it, and a fake satisfies it in tests, so the +// install/verify/retry logic is exercisable without Docker or the network. +type containerExecer interface { + Exec(ctx context.Context, cmd []string, options ...tcexec.ProcessOption) (int, io.Reader, error) +} + +// actInstallMaxAttempts bounds how many times we try the act install before +// giving up. The install failure mode is a transient network hiccup (registry +// rate-limit, a dropped connection while fetching the install script), which a +// clean retry reliably clears, so a small fixed budget is enough. +const actInstallMaxAttempts = 3 + +// actInstallBackoff returns the pause before the retry that follows a failed +// install attempt (attempt is 1-based). It grows 2s then 4s so a momentary +// network blip has time to clear without stalling the suite. It is a var so +// tests can zero the wait and stay fast. +var actInstallBackoff = func(attempt int) time.Duration { + return time.Duration(attempt*2) * time.Second +} + +// installActWithVerify installs the act binary into the container and confirms +// it is actually runnable, retrying transient failures up to maxAttempts. +// +// This exists because container Exec returns a non-nil error ONLY for a Docker +// API/transport failure, NOT when the executed command exits non-zero. The +// original install discarded the exit code, so a failed `curl | bash` (a +// registry rate-limit or a dropped connection) looked like success: the runner +// came back "healthy" and the first workflow run died with +// `bash: act: command not found`. Here we check the exit code, then prove the +// binary resolves with `command -v act`, and only trust an attempt that clears +// both gates. A bounded retry with backoff absorbs the transient hiccup that is +// the usual cause; the final failure surfaces the failing command output so the +// real reason is not lost. +func installActWithVerify(ctx context.Context, exec containerExecer, maxAttempts int) error { + const installScript = "curl -sSf https://raw.githubusercontent.com/nektos/act/master/install.sh | bash -s -- -b /usr/local/bin" + + var lastErr error + for attempt := 1; attempt <= maxAttempts; attempt++ { + if attempt > 1 { + select { + case <-ctx.Done(): + return fmt.Errorf("act install cancelled after %d attempt(s): %w", attempt-1, ctx.Err()) + case <-time.After(actInstallBackoff(attempt - 1)): + } + } + + installCode, installReader, err := exec.Exec(ctx, []string{"bash", "-c", installScript}) + if err != nil { + lastErr = fmt.Errorf("act install exec failed: %w", err) + continue + } + if installCode != 0 { + lastErr = fmt.Errorf("act install exited with code %d: %s", installCode, readExecOutput(installReader)) + continue + } + + verifyCode, verifyReader, err := exec.Exec(ctx, []string{"bash", "-c", "command -v act"}) + if err != nil { + lastErr = fmt.Errorf("act verify exec failed: %w", err) + continue + } + if verifyCode != 0 { + lastErr = fmt.Errorf("act binary not resolvable after install (command -v act exited %d): %s", verifyCode, readExecOutput(verifyReader)) + continue + } + + return nil + } + + return fmt.Errorf("failed to install act after %d attempt(s): %w", maxAttempts, lastErr) +} + +// readExecOutput drains an exec output reader into a trimmed string for error +// messages, tolerating a nil reader. Output here is short (a curl error line or +// a `command -v` result), so reading it whole is fine. +func readExecOutput(reader io.Reader) string { + if reader == nil { + return "" + } + out, err := io.ReadAll(reader) + if err != nil { + return fmt.Sprintf("(failed to read command output: %v)", err) + } + return strings.TrimSpace(string(out)) +} + // Container returns the underlying testcontainers.Container func (a *ActRunner) Container() testcontainers.Container { return a.container diff --git a/e2e/harness/act_test.go b/e2e/harness/act_test.go index 16fc155..04763f6 100644 --- a/e2e/harness/act_test.go +++ b/e2e/harness/act_test.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/binary" + "io" "strings" "testing" "time" @@ -11,6 +12,7 @@ import ( "github.com/moby/moby/api/pkg/stdcopy" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + tcexec "github.com/testcontainers/testcontainers-go/exec" ) // frameDockerStream builds a single Docker hijacked-attach frame: an 8-byte @@ -399,3 +401,118 @@ func TestNormalizeWorkflowResult(t *testing.T) { }) } } + +// scriptedExec is a per-call response the fake execer hands back: an exit code +// and the text the returned reader yields. The install path keys its decisions +// off both, so the fake must control them independently. +type scriptedExec struct { + exitCode int + output string +} + +// fakeExecer is a containerExecer that scripts responses without Docker. It +// classifies each call as an install (the curl|bash command) or a verify +// (`command -v act`) by inspecting the command string, and pops the next +// scripted response from the matching queue. This lets the tests drive the +// transient-failure and binary-missing paths deterministically. +type fakeExecer struct { + installResponses []scriptedExec + verifyResponses []scriptedExec + installCalls int + verifyCalls int +} + +func (f *fakeExecer) Exec(_ context.Context, cmd []string, _ ...tcexec.ProcessOption) (int, io.Reader, error) { + joined := strings.Join(cmd, " ") + switch { + case strings.Contains(joined, "command -v act"): + resp := f.verifyResponses[f.verifyCalls] + f.verifyCalls++ + return resp.exitCode, strings.NewReader(resp.output), nil + default: + resp := f.installResponses[f.installCalls] + f.installCalls++ + return resp.exitCode, strings.NewReader(resp.output), nil + } +} + +// noInstallBackoff zeroes the install retry pause for the duration of a test so +// the retry path runs instantly. These install tests therefore do not call +// t.Parallel: they share the package-level backoff var. +func noInstallBackoff(t *testing.T) { + t.Helper() + prev := actInstallBackoff + actInstallBackoff = func(int) time.Duration { return 0 } + t.Cleanup(func() { actInstallBackoff = prev }) +} + +// TestInstallActWithVerify_TransientInstallFailureThenSuccess proves the install +// path survives a transient non-zero curl|bash exit: the first attempt fails +// (the silent-non-zero-exit footgun this code closes), and a clean retry +// installs and verifies act. Before the fix the discarded exit code let a failed +// install masquerade as success, so the first real workflow run died with +// `bash: act: command not found`. +func TestInstallActWithVerify_TransientInstallFailureThenSuccess(t *testing.T) { + noInstallBackoff(t) + + fake := &fakeExecer{ + installResponses: []scriptedExec{ + {exitCode: 1, output: "curl: (28) timed out"}, + {exitCode: 0, output: ""}, + }, + verifyResponses: []scriptedExec{ + {exitCode: 0, output: "/usr/local/bin/act"}, + }, + } + + err := installActWithVerify(context.Background(), fake, 3) + require.NoError(t, err) + assert.Equal(t, 2, fake.installCalls, "install should be retried once after the transient failure") + assert.Equal(t, 1, fake.verifyCalls, "verify runs once, after the successful install") +} + +// TestInstallActWithVerify_BinaryMissingThenRecovers proves the install path +// does not trust a zero-exit install alone. The first attempt's install exits 0 +// but `command -v act` cannot resolve the binary (a partial or interrupted +// install), so the attempt is rejected and retried; the second attempt both +// installs and verifies cleanly. +func TestInstallActWithVerify_BinaryMissingThenRecovers(t *testing.T) { + noInstallBackoff(t) + + fake := &fakeExecer{ + installResponses: []scriptedExec{ + {exitCode: 0, output: ""}, + {exitCode: 0, output: ""}, + }, + verifyResponses: []scriptedExec{ + {exitCode: 1, output: ""}, + {exitCode: 0, output: "/usr/local/bin/act"}, + }, + } + + err := installActWithVerify(context.Background(), fake, 3) + require.NoError(t, err) + assert.Equal(t, 2, fake.installCalls, "install should be retried after the binary check failed") + assert.Equal(t, 2, fake.verifyCalls, "verify runs once per attempt") +} + +// TestInstallActWithVerify_AllAttemptsFail proves the bounded retry gives up and +// returns an error (rather than a falsely healthy runner) when every attempt +// fails, and that the failing command output is surfaced for debugging. +func TestInstallActWithVerify_AllAttemptsFail(t *testing.T) { + noInstallBackoff(t) + + fake := &fakeExecer{ + installResponses: []scriptedExec{ + {exitCode: 1, output: "curl: (6) could not resolve host"}, + {exitCode: 1, output: "curl: (6) could not resolve host"}, + {exitCode: 1, output: "curl: (6) could not resolve host"}, + }, + verifyResponses: []scriptedExec{}, + } + + err := installActWithVerify(context.Background(), fake, 3) + require.Error(t, err) + assert.Equal(t, 3, fake.installCalls, "all attempts should be exhausted") + assert.Contains(t, err.Error(), "could not resolve host", "failing command output should be surfaced") +}