stablekernel · joshua-temple · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/.github/actions/dispatch-suite/action.yaml b/.github/actions/dispatch-suite/action.yaml
@@ -0,0 +1,102 @@
+name: 'Dispatch scenario suite'
+description: >-
+  Dispatch a downstream cascade-example repo's scenario-suite.yaml on its own
+  main, recover the run id it created, and watch that run to its conclusion.
+
+inputs:
+  repo:
+    description: 'Target repo slug, e.g. stablekernel/cascade-example-primary'
+    required: true
+  token:
+    description: >-
+      PAT with Actions read/write on the target repo. GITHUB_TOKEN cannot
+      dispatch cross-repo, so a fleet-wide fine-grained PAT is mandatory.
+    required: true
+  workflow:
+    description: 'Workflow file to dispatch in the target repo'
+    required: false
+    default: 'scenario-suite.yaml'
+  ref:
+    description: "Target ref to dispatch against (must be the target's default branch)"
+    required: false
+    default: 'main'
+  recover-attempts:
+    description: 'How many times to poll for the dispatched run before giving up'
+    required: false
+    default: '30'
+  recover-interval:
+    description: 'Seconds between recovery polls'
+    required: false
+    default: '10'
+
+runs:
+  using: 'composite'
+  steps:
+    # Reconciliation, dispatch -> recover -> watch, with zero target-side change.
+    #
+    # Cross-repo workflow_dispatch returns 204 with no run id (CONFIRMED in the
+    # pattern research), so we cannot await the run we just created directly. We
+    # recover it by listing the target's scenario-suite runs created at/after the
+    # dispatch timestamp (event = workflow_dispatch) and taking the newest. This
+    # is the current approach; a future refinement could echo a distinct_id
+    # marker into the suite run-name for race-free recovery once the suites
+    # carry one.
+    - name: Dispatch and watch
+      shell: bash
+      env:
+        GH_TOKEN: ${{ inputs.token }}
+        TARGET_REPO: ${{ inputs.repo }}
+        TARGET_WORKFLOW: ${{ inputs.workflow }}
+        TARGET_REF: ${{ inputs.ref }}
+        RECOVER_ATTEMPTS: ${{ inputs.recover-attempts }}
+        RECOVER_INTERVAL: ${{ inputs.recover-interval }}
+      run: |
+        set -euo pipefail
+
+        # Capture a UTC timestamp BEFORE dispatching so the recovery filter only
+        # matches runs this action created, not pre-existing ones.
+        DISPATCH_TS=$(date -u +%Y-%m-%dT%H:%M:%SZ)
+        echo "Dispatching $TARGET_WORKFLOW in $TARGET_REPO @ $TARGET_REF (since $DISPATCH_TS)"
+
+        # NOTE: do NOT pass -f cascade_version=... here. The suites do not define
+        # that input yet, so an extra input would error with "unexpected inputs".
+        # The version under test is computed and logged by the orchestrator but
+        # is inert until the suites accept the input.
+        gh workflow run "$TARGET_WORKFLOW" \
+          --repo "$TARGET_REPO" \
+          --ref "$TARGET_REF"
+
+        # Recover the run id. The run may not be listable right away, so poll
+        # with a bounded retry; a failed list call is a miss, not a fatal error.
+        RUN_ID=""
+        for attempt in $(seq 1 "$RECOVER_ATTEMPTS"); do
+          RUN_ID=$(gh run list \
+            --repo "$TARGET_REPO" \
+            --workflow "$TARGET_WORKFLOW" \
+            --event workflow_dispatch \
+            --created ">=$DISPATCH_TS" \
+            --limit 20 \
+            --json databaseId,status,conclusion,createdAt \
+            --jq 'sort_by(.createdAt) | reverse | .[0].databaseId // empty' || true)
+          if [ -n "$RUN_ID" ]; then
+            echo "Recovered run id $RUN_ID on attempt $attempt"
+            break
+          fi
+          echo "Run not visible yet (attempt $attempt/$RECOVER_ATTEMPTS); sleeping ${RECOVER_INTERVAL}s"
+          sleep "$RECOVER_INTERVAL"
+        done
+
+        if [ -z "$RUN_ID" ]; then
+          echo "::error::Could not recover a $TARGET_WORKFLOW run in $TARGET_REPO after dispatch"
+          exit 1
+        fi
+
+        RUN_URL="https://github.com/$TARGET_REPO/actions/runs/$RUN_ID"
+        echo "Watching $RUN_URL"
+        {
+          echo "- **$TARGET_REPO**: [run $RUN_ID]($RUN_URL)"
+        } >> "$GITHUB_STEP_SUMMARY"
+
+        # Block on the recovered run's conclusion. --exit-status makes gh return
+        # non-zero if the run concluded with a non-success result.
+        gh run watch "$RUN_ID" --repo "$TARGET_REPO" --exit-status
diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
@@ -1,21 +1,21 @@
-# End-to-end test workflow
+# Integration test workflow (act + gitea testcontainers).
 # Triggers:
 #   push:tags        every release tag (existing)
 #   workflow_dispatch  manual run against any ref (existing)
 #   merge_group      runs as a merge-queue gate before merging to main
-#   schedule         nightly at 07:00 UTC (low-traffic window) against main
 #
-# E2E uses act + gitea testcontainers and is too slow + flaky to run per PR.
-# Run locally (`go test -v ./e2e/...`) before pushing instead.
-name: E2E
+# This workflow uses act + gitea testcontainers and is too slow + flaky to run
+# per PR. Run locally (`go test -v ./e2e/...`) before pushing instead.
+#
+# NOTE: the `name:` below is referenced by fleet-e2e.yaml's workflow_run trigger
+# ("Integration (act + gitea)"). Keep the two in sync if this is ever renamed.
+name: Integration (act + gitea)
 
 on:
   push:
     tags:
       - 'v*'
   merge_group:
-  schedule:
-    - cron: '0 7 * * *'
   workflow_dispatch:
     inputs:
       ref:
@@ -42,8 +42,8 @@ jobs:
       - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
         with:
           # workflow_dispatch: honour the explicit ref input.
-          # All other triggers (push:tags, merge_group, schedule): use the
-          # exact SHA that triggered the run so we test what GitHub resolved.
+          # All other triggers (push:tags, merge_group): use the exact SHA
+          # that triggered the run so we test what GitHub resolved.
           ref: ${{ github.event.inputs.ref || github.sha }}
 
       - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0

diff --git a/.github/workflows/fleet-e2e.yaml b/.github/workflows/fleet-e2e.yaml
@@ -0,0 +1,224 @@
+# Fleet E2E - revalidates the downstream cascade-example fleet on live GitHub.
+#
+# This is maintainer CI: hand-written tooling that lives in cascade's repo, not
+# a product feature and not part of cascade's generated output. A green Fleet
+# run means: this cascade version validated across all 8 example
+# repos, each running its own scenario-suite.yaml in its OWN repo context (own
+# token, own main, own manifest). It is the release-candidate fleet gate.
+#
+# Triggers:
+#   workflow_run  of "Integration (act + gitea)" on completion - makes the E2E
+#                 dependency NATIVE: Fleet only fans out once Integration is
+#                 green for an rc tag. No runner held open polling for it.
+#   workflow_dispatch  manual override (bypasses the rc-tag gate intentionally),
+#                 with an optional cascade_version input.
+#
+# IMPORTANT: the workflow_run trigger references the source workflow by its
+# `name:` ("Integration (act + gitea)"). Keep that name in sync with e2e.yaml.
+name: Fleet E2E (live GitHub)
+
+on:
+  workflow_run:
+    workflows: ["Integration (act + gitea)"]
+    types: [completed]
+  workflow_dispatch:
+    inputs:
+      cascade_version:
+        description: >-
+          cascade version to validate (e.g. v1.2.0-rc.1). Default empty resolves
+          to the rc tag on the workflow_run path. NOTE: passing this to the
+          suites is wired but inert until the suites accept the input.
+        required: false
+        default: ''
+
+permissions:
+  contents: read
+
+# Serialises fleet runs that share a key (source ref or dispatch version); the
+# in-flight run is never cancelled. Different rc tags use different groups.
+concurrency:
+  group: fleet-e2e-${{ github.event.workflow_run.head_branch || github.event.inputs.cascade_version || github.run_id }}
+  cancel-in-progress: false
+
+env:
+  # Owner of the eight cascade-example-* repos. primary runs before its two
+  # dependents (they mutate primary's shared external state); the rest are free.
+  FLEET_OWNER: stablekernel
+
+jobs:
+  # Resolve the cascade version under test and re-assert the rc-tag gate as a
+  # job output so every fan-out job can gate on it cheaply.
+  resolve:
+    name: Resolve version under test
+    runs-on: ubuntu-latest
+    # Top-level guard: only fan out for a manual dispatch, or a green
+    # Integration run that was a push of an rc tag. This filters out
+    # merge_group / non-rc completions.
+    #
+    # workflow_run.head_branch carries the short ref name of whatever triggered
+    # the source run. For a tag push that is the tag's short name (e.g.
+    # v1.2.0-rc.1). Because this guard requires a non-empty rc-shaped
+    # head_branch, the head_sha fallback in the compute step below cannot run
+    # on the workflow_run path today; it is defensive code for a relaxed guard.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event.workflow_run.conclusion == 'success' &&
+       github.event.workflow_run.event == 'push' &&
+       startsWith(github.event.workflow_run.head_branch, 'v') &&
+       contains(github.event.workflow_run.head_branch, '-rc.'))
+    permissions:
+      contents: read
+      actions: read
+    outputs:
+      cascade_version: ${{ steps.compute.outputs.cascade_version }}
+    steps:
+      - name: Compute cascade version under test
+        id: compute
+        env:
+          # PAT is only needed for the head_sha -> tag fallback (a cross-ref
+          # lookup against this repo's tags). GITHUB_TOKEN would also work for
+          # same-repo reads, but we standardise on the fleet PAT.
+          GH_TOKEN: ${{ secrets.CASCADE_STATE_TOKEN }}
+          EVENT_NAME: ${{ github.event_name }}
+          INPUT_VERSION: ${{ github.event.inputs.cascade_version }}
+          WR_HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
+          WR_HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
+        run: |
+          set -euo pipefail
+          if [ "$EVENT_NAME" = "workflow_dispatch" ] && [ -n "$INPUT_VERSION" ]; then
+            VERSION="$INPUT_VERSION"
+          elif [ -n "$WR_HEAD_BRANCH" ]; then
+            # Primary path: the rc tag short-name from the source push run.
+            VERSION="$WR_HEAD_BRANCH"
+          elif [ -n "$WR_HEAD_SHA" ]; then
+            # Fallback: head_branch was empty; resolve the rc tag pointing at the
+            # source run's head_sha. Currently unreachable (the job guard needs
+            # a non-empty head_branch); kept for a future relaxed guard.
+            # A sha can carry more than one rc tag; pick the highest by version
+            # sort. --paginate: the tags API returns only 30 entries per page.
+            VERSION=$(gh api --paginate "repos/${GITHUB_REPOSITORY}/tags" \
+              --jq ".[] | select(.commit.sha == \"$WR_HEAD_SHA\") | .name" \
+              | grep -- '-rc\.' | sort -V -r | head -n 1 || true)
+          else
+            VERSION=""
+          fi
+
+          echo "cascade_version=$VERSION" >> "$GITHUB_OUTPUT"
+          {
+            echo "## Fleet E2E"
+            echo ""
+            echo "Trigger: \`$EVENT_NAME\`"
+            echo "cascade version under test: \`${VERSION:-<empty>}\`"
+            echo ""
+            echo "> Version passing to suites is computed and logged here but"
+            echo "> currently INERT: the suites do not yet accept a"
+            echo "> \`cascade_version\` input."
+          } >> "$GITHUB_STEP_SUMMARY"
+
+  # Stage 1: primary. The dependents job `needs` it, so it must pass first.
+  primary:
+    name: primary
+    needs: resolve
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      actions: read
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+      - name: Dispatch and watch primary
+        uses: ./.github/actions/dispatch-suite
+        with:
+          repo: ${{ env.FLEET_OWNER }}/cascade-example-primary
+          token: ${{ secrets.CASCADE_STATE_TOKEN }}
+
+  # Stage 2: dependents mutate primary's shared external state, so they start
+  # only after primary is green; fail-fast is off so one failure spares the other.
+  dependents:
+    name: dependents (${{ matrix.repo }})
+    needs: primary
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      actions: read
+    strategy:
+      fail-fast: false
+      matrix:
+        repo: [artifact-a, artifact-b]
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+      - name: Dispatch and watch
+        uses: ./.github/actions/dispatch-suite
+        with:
+          repo: ${{ env.FLEET_OWNER }}/cascade-example-${{ matrix.repo }}
+          token: ${{ secrets.CASCADE_STATE_TOKEN }}
+
+  # Independent suites: they need only resolve, so they run alongside primary.
+  independents:
+    name: independents (${{ matrix.repo }})
+    needs: resolve
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      actions: read
+    strategy:
+      fail-fast: false
+      matrix:
+        repo: [4env, 3env, 2env, single-env, release-only]
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+      - name: Dispatch and watch
+        uses: ./.github/actions/dispatch-suite
+        with:
+          repo: ${{ env.FLEET_OWNER }}/cascade-example-${{ matrix.repo }}
+          token: ${{ secrets.CASCADE_STATE_TOKEN }}
+
+  # Fan-in: this job's conclusion is the rc fleet gate. It fails if any fan-out
+  # stage did not succeed and writes a per-stage result table to the summary.
+  aggregate:
+    name: Fleet gate
+    needs: [resolve, primary, dependents, independents]
+    # Only render a verdict when the fleet actually fanned out. On filtered-out
+    # completions (merge_group, non-rc tags, failed Integration runs) resolve is
+    # skipped, so this job is skipped too and the run is a clean no-op rather
+    # than a false-red. A genuine fan-out failure still reds the run because
+    # resolve succeeded and the result checks below catch the failed stage.
+    if: always() && needs.resolve.result == 'success'
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+    steps:
+      - name: Aggregate fleet result
+        env:
+          R_PRIMARY: ${{ needs.primary.result }}
+          R_DEPENDENTS: ${{ needs.dependents.result }}
+          R_INDEPENDENTS: ${{ needs.independents.result }}
+          VERSION: ${{ needs.resolve.outputs.cascade_version }}
+        run: |
+          set -euo pipefail
+          {
+            echo "## Fleet E2E result"
+            echo ""
+            echo "cascade version under test: \`${VERSION:-<empty>}\`"
+            echo ""
+            echo "| Stage | Result |"
+            echo "|---|---|"
+            echo "| primary | $R_PRIMARY |"
+            echo "| dependents (artifact-a, artifact-b) | $R_DEPENDENTS |"
+            echo "| independents (4env, 3env, 2env, single-env, release-only) | $R_INDEPENDENTS |"
+            echo ""
+            echo "> rc gate: this conclusion is the fleet validation signal for"
+            echo "> the rc tag. rc -> release promotion should consume the latest"
+            echo "> fleet-e2e conclusion for that tag before promoting."
+          } >> "$GITHUB_STEP_SUMMARY"
+
+          fail=0
+          for r in "$R_PRIMARY" "$R_DEPENDENTS" "$R_INDEPENDENTS"; do
+            if [ "$r" != "success" ]; then
+              fail=1
+            fi
+          done
+          if [ "$fail" -ne 0 ]; then
+            echo "::error::Fleet E2E failed: one or more suites did not pass"
+            exit 1
+          fi
+          echo "Fleet E2E passed across all suites"
diff --git a/.github/workflows/validate.yaml b/.github/workflows/validate.yaml
@@ -1,6 +1,13 @@
-# Validation workflow - runs tests and lint
-# Called by orchestrate workflow during CI/CD
-name: Validate
+# Tests & Lint - runs go test -race + coverage and golangci-lint.
+#
+# Triggers:
+#   workflow_call    invoked by orchestrate.yaml on PRs (keep - do not remove).
+#   push: tags       standalone run on every release/rc tag.
+#   workflow_dispatch  manual standalone run against any ref.
+#
+# The standalone triggers give this workflow runs of its own so its status
+# badge renders; a workflow_call-only workflow has no standalone runs to badge.
+name: Tests & Lint
 
 on:
   workflow_call:
@@ -14,6 +21,10 @@ on:
       result:
         description: 'Validation result (success/failure)'
         value: ${{ jobs.validate.outputs.result }}
+  push:
+    tags:
+      - 'v*'
+  workflow_dispatch:
 
 permissions:
   contents: read