From e232a0890d55ea97c681cbb7a24e201b4baa5fff Mon Sep 17 00:00:00 2001
From: Joshua Temple <joshua.temple@stablekernel.com>
Date: Sun, 14 Jun 2026 19:39:34 -0400
Subject: [PATCH 1/2] test(e2e): seed callback workflows before generation for
 validate and failing-rollback scenarios

PR #161 removed the inline run callback emission path and converted two scenarios to reusable workflows, but staged the referenced workflow bodies only via a step's commit.files, which lands after generation runs. The harness seeds callback stubs and runs generation once at setup, so validate.yaml was missing at generation time and the failing rollback deploy used the generic non-failing stub. PR #162 narrowed the transient classifier, which surfaced both failures.

Add a setup_workflows map on MultiStepScenario seeded into the setup commit before generation, and a validate workflow_call stub clause keyed on config.validate.workflow. Move the failing deploy-app.yaml into setup_workflows so the rollback re-deploy fails under the Rollback caller, and rely on the seeded validate stub so orchestrate.yaml generation emits the gate.

Test-infra only; no product behavior change.

Signed-off-by: Joshua Temple <joshua.temple@stablekernel.com>
---
 e2e/harness/harness.go                        | 57 ++++++++++++++-
 e2e/harness/multistep.go                      | 12 ++++
 e2e/harness/scenario_retry.go                 |  2 +-
 e2e/scenarios/17-validate-callback.yaml       | 24 ++-----
 ...ollback-failed-deploy-no-state-change.yaml | 72 +++++++++++--------
 5 files changed, 115 insertions(+), 52 deletions(-)

diff --git a/e2e/harness/harness.go b/e2e/harness/harness.go
index 3d567d8..ea71519 100644
--- a/e2e/harness/harness.go
+++ b/e2e/harness/harness.go
@@ -97,7 +97,15 @@ func (h *Harness) SetupInfra(ctx context.Context) error {
 }
 
 // StageRepoFromConfig creates a repo with the given config for multi-step scenarios
-func (h *Harness) StageRepoFromConfig(ctx context.Context, config Config) error {
+// StageRepoFromConfig creates the test repo, writes the manifest and the stub
+// callback workflows derived from config, then runs workflow generation. The
+// optional setupWorkflows map seeds additional reusable callback workflow files
+// (keyed by repository path) into the same setup commit before generation, and
+// overrides any auto-generated stub at the same path. Scenarios use it to supply
+// a callback body the generic stub cannot express (for example a deploy that
+// fails only under the Rollback workflow); files staged via a later step's
+// commit.files would land after generation and never be read by the generator.
+func (h *Harness) StageRepoFromConfig(ctx context.Context, config Config, setupWorkflows map[string]string) error {
 	var err error
 
 	// Create repo
@@ -159,6 +167,25 @@ func (h *Harness) StageRepoFromConfig(ctx context.Context, config Config) error
 				files[p] = generateChangelogStubWorkflow(scenarioTag)
 			}
 		}
+		// A top-level validate callback is a reusable workflow the generated
+		// orchestrate.yaml invokes as a job-level uses:. Stub it so the generator
+		// can read the referenced workflow at generation time and emit the validate
+		// gate. Without a seeded stub the generator fails reading validate.yaml,
+		// since the file would otherwise only arrive via a later step commit.
+		if wf, ok := config.Validate["workflow"].(string); ok && wf != "" {
+			if p := normalizeCallbackStubPath(wf); p != "" {
+				files[p] = generateValidateStubWorkflow(scenarioTag)
+			}
+		}
+
+		// Seed scenario-supplied reusable callback workflows last so they override
+		// any auto-generated stub at the same path. These bodies express behavior
+		// the generic stub cannot (for example a deploy that exits non-zero only
+		// under the Rollback workflow) and must be present before generation reads
+		// the referenced workflows.
+		for path, body := range setupWorkflows {
+			files[path] = body
+		}
 
 		// Create mock setup-cli action that installs CLI from repo
 		// The generated workflows reference stablekernel/cascade/.github/actions/setup-cli
@@ -418,6 +445,34 @@ jobs:
 `, displayName)
 }
 
+// generateValidateStubWorkflow returns a reusable workflow_call stub for a
+// top-level validate callback. A non-empty scenarioTag changes its name from
+// "validate" to "validate [scenario-<tag>]". The generated orchestrate.yaml
+// invokes it as a job-level uses: and gates build jobs on needs.validate.result,
+// so it declares optional environment/sha inputs and an always-succeeding job.
+func generateValidateStubWorkflow(scenarioTag string) string {
+	displayName := "validate"
+	if scenarioTag != "" {
+		displayName = fmt.Sprintf("validate [scenario-%s]", scenarioTag)
+	}
+	return fmt.Sprintf(`name: %s
+on:
+  workflow_call:
+    inputs:
+      environment:
+        required: false
+        type: string
+      sha:
+        required: false
+        type: string
+jobs:
+  runvalidate:
+    runs-on: ubuntu-latest
+    steps:
+      - run: echo "validate"
+`, displayName)
+}
+
 // GenerateWorkflows generates GitHub Actions workflows from cicd-config.yaml
 func (h *Harness) GenerateWorkflows(ctx context.Context) error {
 	if h.repo == nil {
diff --git a/e2e/harness/multistep.go b/e2e/harness/multistep.go
index e25f730..b792781 100644
--- a/e2e/harness/multistep.go
+++ b/e2e/harness/multistep.go
@@ -15,6 +15,18 @@ type MultiStepScenario struct {
 	Config      Config      `yaml:"config"`
 	Setup       *SetupState `yaml:"setup,omitempty"` // Optional initial state
 	Steps       []Step      `yaml:"steps"`
+	// SetupWorkflows seeds reusable callback workflow files into the setup commit
+	// BEFORE workflow generation runs, keyed by repository path (for example
+	// ".github/workflows/deploy-app.yaml"). The harness generates a generic
+	// non-failing stub for every build/deploy callback, which is sufficient for
+	// most scenarios. A scenario that needs a callback to behave differently (for
+	// example a deploy that exits non-zero only under the Rollback workflow) sets
+	// the exact reusable-workflow body here so it is present on disk when the
+	// generator reads the referenced workflow. A staged file overrides any
+	// auto-generated stub at the same path. These files are harness-side only and
+	// are never written into the generated manifest. Staging them via a step's
+	// commit.files would land after generation, so they would be invisible to it.
+	SetupWorkflows map[string]string `yaml:"setup_workflows,omitempty"`
 }
 
 // SetupState defines optional initial state for the scenario
diff --git a/e2e/harness/scenario_retry.go b/e2e/harness/scenario_retry.go
index c73935f..8a62e9c 100644
--- a/e2e/harness/scenario_retry.go
+++ b/e2e/harness/scenario_retry.go
@@ -185,7 +185,7 @@ func RunMultiStepScenario(ctx context.Context, t *testing.T, scenario *MultiStep
 		if err := h.SetupInfra(ctx); err != nil {
 			return fmt.Errorf("failed to setup infrastructure: %w", err)
 		}
-		if err := h.StageRepoFromConfig(ctx, scenario.Config); err != nil {
+		if err := h.StageRepoFromConfig(ctx, scenario.Config, scenario.SetupWorkflows); err != nil {
 			return fmt.Errorf("failed to stage repo: %w", err)
 		}
 
diff --git a/e2e/scenarios/17-validate-callback.yaml b/e2e/scenarios/17-validate-callback.yaml
index c922b08..eb0be16 100644
--- a/e2e/scenarios/17-validate-callback.yaml
+++ b/e2e/scenarios/17-validate-callback.yaml
@@ -6,6 +6,11 @@ description: |
 
   Generator-output verification only.
 
+  The validate callback (validate.yaml) is a reusable workflow the generator
+  reads at generation time to discover its inputs and emit the gate. The harness
+  seeds a workflow_call stub for it from config.validate.workflow before
+  generation runs, so the referenced file is on disk when the generator reads it.
+
 config:
   trunk_branch: main
   environments: [dev]
@@ -29,25 +34,6 @@ steps:
         src/app.go: |
           package main
           func main() {}
-        # Reusable validate callback the generated orchestrate.yaml invokes as a
-        # uses: job. Its inner job echoes "validate" so the gate has a real job to
-        # wait on; the orchestrate build jobs gate on needs.validate.result.
-        .github/workflows/validate.yaml: |
-          name: validate
-          on:
-            workflow_call:
-              inputs:
-                environment:
-                  required: false
-                  type: string
-                sha:
-                  required: false
-                  type: string
-          jobs:
-            runvalidate:
-              runs-on: ubuntu-latest
-              steps:
-                - run: echo "validate"
     expect:
       workflow_files:
         - path: ".github/workflows/orchestrate.yaml"
diff --git a/e2e/scenarios/rollback/rollback-failed-deploy-no-state-change.yaml b/e2e/scenarios/rollback/rollback-failed-deploy-no-state-change.yaml
index 6b5ee97..1bbfa2a 100644
--- a/e2e/scenarios/rollback/rollback-failed-deploy-no-state-change.yaml
+++ b/e2e/scenarios/rollback/rollback-failed-deploy-no-state-change.yaml
@@ -47,6 +47,47 @@ config:
       workflow: .github/workflows/deploy-app.yaml
       triggers: ["**"]
 
+# Seed the failing deploy callback into the setup commit BEFORE generation, so
+# the generator reads this body (not the generic harness stub) when it resolves
+# deploy-app.yaml. Staging it via step 1's commit.files would land after
+# generation and the generator would never see it, leaving the generic stub that
+# always succeeds in place, so the rollback re-deploy would not fail. The inner
+# job is appdeploy (matched by expect.jobs below). It succeeds for the two setup
+# promotes and exits non-zero only on the rollback re-deploy; the run step below
+# documents the signal it uses to tell the two apart.
+setup_workflows:
+  .github/workflows/deploy-app.yaml: |
+    name: deploy-app
+    on:
+      workflow_call:
+        inputs:
+          environment:
+            required: false
+            type: string
+          sha:
+            required: false
+            type: string
+    jobs:
+      appdeploy:
+        runs-on: ubuntu-latest
+        steps:
+          - env:
+              DEPLOY_ENV: ${{ inputs.environment }}
+              DEPLOY_SHA: ${{ inputs.sha }}
+            run: |
+              echo "deploy of env=$DEPLOY_ENV sha=$DEPLOY_SHA via workflow=$GITHUB_WORKFLOW"
+              # GITHUB_WORKFLOW inside a reusable callback is the caller
+              # workflow's name. The harness suffixes each with
+              # [scenario-<tag>], so the Rollback workflow surfaces it as
+              # "Rollback [scenario-...]". Match the Rollback prefix.
+              case "$GITHUB_WORKFLOW" in
+                Rollback*)
+                  echo "failing the rollback re-deploy on purpose"
+                  exit 1
+                  ;;
+              esac
+              echo "promote deploy succeeded"
+
 steps:
   - name: "Commit the first version source"
     action: commit
@@ -56,37 +97,6 @@ steps:
         src/app.go: |
           package main
           func main() {}
-        .github/workflows/deploy-app.yaml: |
-          name: deploy-app
-          on:
-            workflow_call:
-              inputs:
-                environment:
-                  required: false
-                  type: string
-                sha:
-                  required: false
-                  type: string
-          jobs:
-            appdeploy:
-              runs-on: ubuntu-latest
-              steps:
-                - env:
-                    DEPLOY_ENV: ${{ inputs.environment }}
-                    DEPLOY_SHA: ${{ inputs.sha }}
-                  run: |
-                    echo "deploy of env=$DEPLOY_ENV sha=$DEPLOY_SHA via workflow=$GITHUB_WORKFLOW"
-                    # GITHUB_WORKFLOW inside a reusable callback is the caller
-                    # workflow's name. The harness suffixes each with
-                    # [scenario-<tag>], so the Rollback workflow surfaces it as
-                    # "Rollback [scenario-...]". Match the Rollback prefix.
-                    case "$GITHUB_WORKFLOW" in
-                      Rollback*)
-                        echo "failing the rollback re-deploy on purpose"
-                        exit 1
-                        ;;
-                    esac
-                    echo "promote deploy succeeded"
 
   - name: "Orchestrate the first commit into dev"
     action: orchestrate

From 2a275cf14d61c3cd39574975845d2fa05457f304 Mon Sep 17 00:00:00 2001
From: Joshua Temple <joshua.temple@stablekernel.com>
Date: Sun, 14 Jun 2026 19:43:44 -0400
Subject: [PATCH 2/2] test(e2e): fail rollback re-deploy via dispatch env not
 caller workflow name

Under act, which runs these scenarios, $GITHUB_WORKFLOW inside a reusable workflow_call callback is the callee's own name, not the caller's. (GitHub-hosted runs associate the github context with the caller workflow, so this is runner-specific.) The converted rollback scenario keyed its failing deploy on the caller name (Rollback*), which never matches under act, so the re-deploy always succeeded and the state-unchanged assertion broke. Both promote and rollback also dispatch via workflow_dispatch, so the event name cannot disambiguate either. Set CASCADE_E2E_ROLLBACK on the rollback dispatch only and key the deploy callback on it; act passes top-level env into the reusable callee.

Signed-off-by: Joshua Temple <joshua.temple@stablekernel.com>
---
 e2e/harness/rollback_actions.go               |  8 +++
 ...ollback-failed-deploy-no-state-change.yaml | 56 ++++++++++---------
 2 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/e2e/harness/rollback_actions.go b/e2e/harness/rollback_actions.go
index a242cd4..d60c1c8 100644
--- a/e2e/harness/rollback_actions.go
+++ b/e2e/harness/rollback_actions.go
@@ -67,6 +67,14 @@ func (r *Runner) executeRollback(ctx context.Context, rollback *RollbackStep, co
 		Env: map[string]string{
 			"GITHUB_REF":        fmt.Sprintf("refs/heads/%s", branch),
 			"GITHUB_REPOSITORY": fmt.Sprintf("%s/%s", AdminUsername, r.harness.repo.Name),
+			// Mark this run as a rollback so a deploy callback can distinguish the
+			// rollback re-deploy from a setup promote. Both are dispatched via
+			// workflow_dispatch, and under act a reusable callback sees its own name
+			// in $GITHUB_WORKFLOW (the callee's, not the caller's), so the caller's
+			// workflow name is not a usable signal. act passes top-level --env into
+			// every job, including reusable callees, so this is observable in the
+			// deploy step. Only the rollback path sets it; promote never does.
+			"CASCADE_E2E_ROLLBACK": "1",
 		},
 	})
 	if err != nil {
diff --git a/e2e/scenarios/rollback/rollback-failed-deploy-no-state-change.yaml b/e2e/scenarios/rollback/rollback-failed-deploy-no-state-change.yaml
index 1bbfa2a..5442d80 100644
--- a/e2e/scenarios/rollback/rollback-failed-deploy-no-state-change.yaml
+++ b/e2e/scenarios/rollback/rollback-failed-deploy-no-state-change.yaml
@@ -7,9 +7,9 @@ description: |
   succeed, leaving trunk state exactly where it was.
 
   prod is advanced through two published versions, then a rollback is requested.
-  The inline deploy succeeds while it is invoked by the Promote workflow (so the
-  two setup promotions land prod on commit1 then commit2), and exits non-zero
-  only when invoked by the Rollback workflow's re-deploy. finalize then sees
+  The deploy succeeds while it is invoked by the Promote workflow (so the two
+  setup promotions land prod on commit1 then commit2), and exits non-zero only
+  when invoked by the Rollback workflow's re-deploy. finalize then sees
   DEPLOY_RESULT_APP=failure, aborts before writing, and the run concludes in
   failure. prod state therefore stays on the second version: no rolled-back SHA,
   no rollback ref. This is the safety property that distinguishes a real
@@ -17,13 +17,14 @@ description: |
 
   The deploy is a reusable workflow whose inner job runs under act (no actions/
   checkout, which act cannot resolve for a reusable callback against the
-  per-scenario gitea), and is asserted as appdeploy: failure. The deploy keys its
-  conditional failure on $GITHUB_WORKFLOW, which inside a reusable callback is the
-  invoking (caller) workflow's name: the generated promote workflow sets
-  name: Promote and the rollback workflow sets name: Rollback, and the harness
-  suffixes each with [scenario-...], so the deploy matches the Rollback prefix.
-  The Promote workflow succeeds, only the Rollback re-deploy fails, exercising the
-  gateOnDeployResults guard rather than aborting during setup.
+  per-scenario gitea), and is asserted as appdeploy: failure. Under act, inside a
+  reusable workflow_call callback $GITHUB_WORKFLOW is the callee's own name (on
+  GitHub-hosted runs it is the caller's), so in this harness the workflow name
+  cannot tell the rollback re-deploy from a setup promote (both also run under
+  workflow_dispatch). The harness instead sets CASCADE_E2E_ROLLBACK=1 only on the
+  rollback dispatch, and act passes that top-level env into the reusable callee,
+  so the deploy fails only on the rollback re-deploy. The two setup promotes
+  succeed, exercising the gateOnDeployResults guard rather than aborting setup.
 
 config:
   trunk_branch: main
@@ -37,12 +38,12 @@ config:
     # Rollback workflow's re-deploy. Both workflows thread the same with: inputs
     # (environment, sha) into this callback, and the rollback target SHA equals an
     # earlier promote's SHA, so the SHA alone cannot tell the two apart. The
-    # invoking workflow name ($GITHUB_WORKFLOW, the caller's name inside a reusable
-    # callback) does: it is "Promote" during the two setup promotions and
-    # "Rollback" during the re-deploy under test. Failing only on Rollback forces
-    # appdeploy to fail there, so finalize sees DEPLOY_RESULT_APP=failure and
-    # aborts the state write, while setup lands prod on commit1 then commit2
-    # cleanly. act keys it by the inner job id appdeploy.
+    # callee cannot read the caller's workflow name ($GITHUB_WORKFLOW resolves to
+    # the callee inside a reusable callback), so the harness sets the
+    # CASCADE_E2E_ROLLBACK env only on the rollback dispatch. Failing only when it
+    # is set forces appdeploy to fail on the rollback re-deploy, so finalize sees
+    # DEPLOY_RESULT_APP=failure and aborts the state write, while setup lands prod
+    # on commit1 then commit2 cleanly. act keys it by the inner job id appdeploy.
     - name: app
       workflow: .github/workflows/deploy-app.yaml
       triggers: ["**"]
@@ -75,17 +76,18 @@ setup_workflows:
               DEPLOY_ENV: ${{ inputs.environment }}
               DEPLOY_SHA: ${{ inputs.sha }}
             run: |
-              echo "deploy of env=$DEPLOY_ENV sha=$DEPLOY_SHA via workflow=$GITHUB_WORKFLOW"
-              # GITHUB_WORKFLOW inside a reusable callback is the caller
-              # workflow's name. The harness suffixes each with
-              # [scenario-<tag>], so the Rollback workflow surfaces it as
-              # "Rollback [scenario-...]". Match the Rollback prefix.
-              case "$GITHUB_WORKFLOW" in
-                Rollback*)
-                  echo "failing the rollback re-deploy on purpose"
-                  exit 1
-                  ;;
-              esac
+              echo "deploy of env=$DEPLOY_ENV sha=$DEPLOY_SHA rollback=${CASCADE_E2E_ROLLBACK:-0}"
+              # Under act, $GITHUB_WORKFLOW inside a reusable workflow_call
+              # callback is the callee's own name (deploy-app), not the caller's
+              # (GitHub-hosted runs report the caller's), so it cannot tell
+              # rollback from promote here. The harness instead sets
+              # CASCADE_E2E_ROLLBACK=1 only on the rollback dispatch (act passes
+              # top-level --env into reusable callees), so the rollback re-deploy
+              # fails while the two setup promotes succeed.
+              if [ "${CASCADE_E2E_ROLLBACK:-0}" = "1" ]; then
+                echo "failing the rollback re-deploy on purpose"
+                exit 1
+              fi
               echo "promote deploy succeeded"
 
 steps: