-
Notifications
You must be signed in to change notification settings - Fork 0
448 lines (417 loc) · 19.8 KB
/
Copy pathfleet-e2e.yaml
File metadata and controls
448 lines (417 loc) · 19.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
# Fleet E2E - revalidates the downstream cascade-example fleet on live GitHub.
#
# This is maintainer CI: hand-written tooling that lives in cascade's repo, not
# a product feature and not part of cascade's generated output. A green Fleet
# run means: this cascade version validated across all 8 example
# repos, each running its own scenario-suite.yaml in its OWN repo context (own
# token, own main, own manifest). It is the release-candidate fleet gate.
#
# Triggers:
#   workflow_run       Completion of the "Release" workflow. The fleet validates
#                      the PUBLISHED artifact: Release runs GoReleaser on every
#                      rc tag push, so a successful Release run means the binary
#                      the suites install is actually on the releases page. The
#                      fleet only fans out once that publish succeeded for an rc
#                      tag, so no runner is held open polling.
#   workflow_dispatch  Manual override (bypasses the rc-tag gate intentionally),
#                      with an optional cascade_version input.
#
# We key off "Release" rather than the `release:` event because promote-driven
# API releases do not reliably emit `release: published` (see release.yaml #86);
# the rc-tag push that drives GoReleaser is the dependable signal.
#
# IMPORTANT: the workflow_run trigger references the source workflow by its
# `name:` ("Release"). Keep that name in sync with release.yaml.
name: Fleet E2E (live GitHub)

on:
  # Fires on every completion of the workflow whose `name:` is "Release"; the
  # rc-tag / success filtering happens in the resolve job's `if`.
  workflow_run:
    workflows:
      - Release
    types:
      - completed
  # Manual run; cascade_version is optional.
  workflow_dispatch:
    inputs:
      cascade_version:
        description: >-
          cascade version to validate (e.g. v1.2.0-rc.1). Default empty resolves
          to the rc tag on the workflow_run path. NOTE: passing this to the
          suites is wired but inert until the suites accept the input.
        required: false
        default: ""

# Workflow-wide default; jobs that need more declare their own permissions.
permissions:
  contents: read

# One concurrency group per version under test. The group key is the source
# run's head_branch (the rc tag short name on the workflow_run path), else the
# dispatched cascade_version, else the run id. Runs for different versions land
# in different groups and do not wait on each other. cancel-in-progress is
# false, so a fleet run that is already in flight is never cancelled by a later
# run in the same group.
concurrency:
  group: fleet-e2e-${{ github.event.workflow_run.head_branch || github.event.inputs.cascade_version || github.run_id }}
  cancel-in-progress: false

env:
  # Owner of the eight downstream example repos. primary must finish before its
  # two dependents (they mutate primary's shared external state); the rest are
  # independent.
  FLEET_OWNER: stablekernel
jobs:
# Resolve the cascade version under test and re-assert the rc-tag gate as a
# job output so every fan-out job can gate on it cheaply.
resolve:
  name: Resolve version under test
  runs-on: ubuntu-latest
  # Top-level guard: only fan out for a manual dispatch, or a green Release run
  # that was a push of an rc tag. This filters out non-rc tag publishes and any
  # non-success completions.
  #
  # workflow_run.head_branch carries the short ref name of whatever triggered
  # the source run. For a tag push that is the tag's short name (e.g.
  # v1.2.0-rc.1).
  #
  # NOTE(review): because this gate requires head_branch to start with "v" and
  # contain "-rc." on the workflow_run path, head_branch is never empty when
  # the compute step runs for a workflow_run event. The head_sha fallback in
  # that step therefore only becomes reachable if this gate is relaxed.
  if: >-
    github.event_name == 'workflow_dispatch' ||
    (github.event.workflow_run.conclusion == 'success' &&
    github.event.workflow_run.event == 'push' &&
    startsWith(github.event.workflow_run.head_branch, 'v') &&
    contains(github.event.workflow_run.head_branch, '-rc.'))
  permissions:
    contents: read
    actions: read
  outputs:
    cascade_version: ${{ steps.compute.outputs.cascade_version }}
  steps:
    - name: Compute cascade version under test
      id: compute
      env:
        # PAT is only needed for the head_sha -> tag fallback (a cross-ref
        # lookup against this repo's tags). GITHUB_TOKEN would also work for
        # same-repo reads, but we standardise on the fleet PAT.
        GH_TOKEN: ${{ secrets.CASCADE_STATE_TOKEN }}
        # Event data is passed through env (never interpolated into the script
        # text) so it is always handled as data by the shell.
        EVENT_NAME: ${{ github.event_name }}
        INPUT_VERSION: ${{ github.event.inputs.cascade_version }}
        WR_HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
        WR_HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
      run: |
        set -euo pipefail
        if [ "$EVENT_NAME" = "workflow_dispatch" ] && [ -n "$INPUT_VERSION" ]; then
          # Manual override: trust the dispatched version as given.
          VERSION="$INPUT_VERSION"
        elif [ -n "$WR_HEAD_BRANCH" ]; then
          # Primary path: the rc tag short-name from the source push run.
          VERSION="$WR_HEAD_BRANCH"
        elif [ -n "$WR_HEAD_SHA" ]; then
          # Fallback: head_branch was empty; resolve the rc tag pointing at the
          # source run's head_sha. The result may be empty, hence `|| true`.
          # The tags endpoint is paginated, so --paginate walks every page;
          # without it a tag beyond the first page would never be found.
          # A sha can carry more than one rc tag; pick the highest by version
          # sort so selection is deterministic regardless of API ordering.
          VERSION=$(gh api --paginate "repos/${GITHUB_REPOSITORY}/tags" \
            --jq ".[] | select(.commit.sha == \"$WR_HEAD_SHA\") | .name" \
            | grep -- '-rc\.' | sort -V -r | head -n 1 || true)
        else
          # Dispatch with no input and no source run: nothing to resolve.
          VERSION=""
        fi
        # The resolved value is written to GITHUB_OUTPUT and later spliced into
        # sed replacements by the repin job. Refuse anything that is not a
        # plain version-like token so a stray newline, "|" or "&" can neither
        # forge an extra output nor corrupt those substitutions. An empty value
        # is still allowed through (the repin job reports it).
        safe_version_re='^[0-9A-Za-z._+-]+$'
        if [ -n "$VERSION" ] && ! [[ "$VERSION" =~ $safe_version_re ]]; then
          echo "::error::Resolved cascade version contains characters outside [0-9A-Za-z._+-]; refusing to fan out"
          exit 1
        fi
        echo "cascade_version=$VERSION" >> "$GITHUB_OUTPUT"
        # Persist the resolved version so it can cross the workflow_run
        # boundary into auto-promote. This is the exact value every suite is
        # pinned to, so a green fleet and the promoted base never disagree.
        printf '%s' "$VERSION" > version-under-test.txt
        {
          echo "## Fleet E2E"
          echo ""
          echo "Trigger: \`$EVENT_NAME\`"
          echo "cascade version under test: \`${VERSION:-<empty>}\`"
          echo ""
          echo "> The repin job pins all 8 example repos to this version"
          echo "> before any suite fans out, so the suites run the binary"
          echo "> named here rather than a stale pinned one."
        } >> "$GITHUB_STEP_SUMMARY"
    # Hand the resolved version-under-test to auto-promote. A workflow_run
    # does not inherit the triggering run's dispatch inputs, so auto-promote
    # reads this artifact as the authoritative version the fleet validated
    # rather than guessing from head_branch (which is `main` on the
    # workflow_dispatch path).
    - name: Upload resolved version-under-test
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: version-under-test
        path: version-under-test.txt
        if-no-files-found: error
        retention-days: 7
# Repin: pin every example repo to the rc UNDER TEST before any suite fans
# out. Without this the suites would install whatever version each repo's
# manifest is statically pinned to, so a fresh rc would never actually run -
# the "version under test" label would outrun reality. This job downloads the
# rc binary, regenerates each repo's workflows against it, and pushes the
# repin to each repo's main (idempotent: no change -> no commit). Every suite
# job gates on this job so none can start against a stale pin.
repin:
  name: Repin fleet to rc
  needs: resolve
  runs-on: ubuntu-latest
  permissions:
    contents: read
  env:
    RC_VERSION: ${{ needs.resolve.outputs.cascade_version }}
    # The same fleet PAT under two names: STATE_TOKEN is embedded in the clone
    # URL below, GH_TOKEN is what the gh CLI reads.
    STATE_TOKEN: ${{ secrets.CASCADE_STATE_TOKEN }}
    GH_TOKEN: ${{ secrets.CASCADE_STATE_TOKEN }}
  steps:
    - name: Download the rc cascade binary
      env:
        REPO: ${{ github.repository }}
      run: |
        set -euo pipefail
        if [ -z "${RC_VERSION:-}" ]; then
          echo "::error::No cascade version resolved; cannot repin the fleet"
          exit 1
        fi
        # GoReleaser strips the leading v from the embedded version, so
        # `cascade version` prints the tag WITHOUT it. Keep both forms: the
        # v-prefixed tag for release/manifest refs, the bare form for the
        # binary self-report comparison.
        RC_BARE="${RC_VERSION#v}"
        echo "RC_BARE=$RC_BARE" >> "$GITHUB_ENV"
        # Scratch dir under its own name: TMPDIR is the variable mktemp and
        # other tools consult for their temp root, so it must not be pointed at
        # a directory this step deletes. The trap removes the scratch dir on
        # every exit path, including failures.
        download_dir=$(mktemp -d)
        trap 'rm -rf "$download_dir"' EXIT
        echo "Downloading $RC_VERSION linux/amd64 archive from $REPO"
        # Only the .tar.gz archive is consumed below, so fetch only that rather
        # than every asset whose name mentions linux_amd64.
        gh release download "$RC_VERSION" \
          --repo "$REPO" \
          --pattern '*linux_amd64*.tar.gz' \
          --dir "$download_dir"
        tar -xzf "$download_dir"/*.tar.gz -C "$download_dir"
        install -m 0755 "$download_dir/cascade" /usr/local/bin/cascade
        INSTALLED=$(cascade version 2>/dev/null | head -n 1 | awk '{print $2}')
        # Tolerate a leading v in the self-report so the check tracks the
        # release tag rather than a future ldflags formatting choice.
        echo "Installed cascade version: $INSTALLED (expected $RC_BARE)"
        if [ "${INSTALLED#v}" != "$RC_BARE" ]; then
          echo "::error::Downloaded binary reports '$INSTALLED' but expected '$RC_BARE'"
          exit 1
        fi
    - name: Configure git identity
      run: |
        set -euo pipefail
        git config --global user.name "cascade-fleet-bot"
        git config --global user.email "cascade-fleet-bot@users.noreply.github.com"
    - name: Repin each example repo to the rc
      run: |
        set -euo pipefail
        # The 8 example repos. Repinning means: set manifest cli_version to the
        # rc, replace any other in-repo rc-version refs, regenerate the workflows
        # with the rc binary, then commit + push only if something changed. This
        # preserves every hand-written suite feature: regeneration only rewrites
        # the generated workflows, and we touch nothing else.
        REPOS="primary artifact-a artifact-b 4env 3env 2env single-env release-only"

        # NOTE on error handling: the functions below are all invoked on the
        # left-hand side of `||`, and bash ignores `set -e` for everything
        # executed in that context. A failing command inside them does NOT
        # abort on its own, so every step that must succeed carries an explicit
        # `|| return 1`.

        # Apply the repin mutation to the checkout in the current directory:
        # point cli_version at the rc, rewrite any other in-repo rc refs, then
        # regenerate the generated workflows. Re-runnable, because the retry
        # loop resets the tree to the fetched remote tip and re-applies this on
        # top of it (mirroring cascade's commitWithApplicationRetry).
        #   $1 - path of the manifest, relative to the checkout root.
        # Returns non-zero if an edit or the regeneration fails.
        apply_repin() {
          local manifest="$1"
          local f
          # 1. Point the manifest cli_version at the rc.
          sed -i -E "s|^([[:space:]]*cli_version:[[:space:]]*).*$|\1${RC_VERSION}|" "$manifest" || return 1
          # 2. Replace any other in-repo rc-version refs (e.g. an explicit
          #    setup-cli@v..-rc.. pin a suite hand-wrote) with the rc. git grep
          #    limits the scan to tracked YAML files in the working tree; the
          #    regen below rewrites generated workflows, this catches anything
          #    outside them. No match is not an error, hence `|| true`.
          while IFS= read -r f; do
            [ -f "$f" ] || continue
            sed -i -E "s|v[0-9]+\.[0-9]+\.[0-9]+-rc\.[0-9]+|${RC_VERSION}|g" "$f" || return 1
          done < <(git grep -lE "v[0-9]+\.[0-9]+\.[0-9]+-rc\.[0-9]+" -- '*.yaml' '*.yml' 2>/dev/null || true)
          # 3. Regenerate the workflows with the rc binary. This rewrites the
          #    generated setup-cli refs to the rc and nothing hand-written.
          #    As the last command, its status is the function's status.
          cascade generate-workflow --force -c "$manifest"
        }

        # Repin one repo. Returns non-zero on any regen or push failure so the
        # caller can record it and red the job. The example repos' main is
        # protected, but the fleet token has write access (the suites' own
        # state-writes to the same main succeed). A force push is rejected by
        # the ruleset; a NORMAL fast-forward push is not. We clone fresh main,
        # so the first push is a fast-forward. On a non-fast-forward rejection
        # (a concurrent write landed) we fetch/reset/re-apply/retry, up to
        # MAX_ATTEMPTS, exactly as cascade's state-writer does.
        #   $1 - owner/name slug of the repo to repin.
        # Changes directory into a temp clone, so call it in a subshell.
        MAX_ATTEMPTS=5
        repin_repo() {
          local slug="$1"
          local workdir manifest attempt status push_out
          workdir=$(mktemp -d)
          git clone --depth 1 \
            "https://x-access-token:${STATE_TOKEN}@github.com/${slug}.git" \
            "$workdir" || return 1
          cd "$workdir" || return 1
          manifest=".github/manifest.yaml"
          if [ ! -f "$manifest" ]; then
            echo "::error::${slug} has no ${manifest}"
            return 1
          fi
          for attempt in $(seq 1 "$MAX_ATTEMPTS"); do
            apply_repin "$manifest" || return 1
            # No diff means the remote already matches the rc: nothing to push.
            if [ -z "$(git status --porcelain)" ]; then
              echo "${slug} already at ${RC_VERSION}; nothing to repin"
              return 0
            fi
            git add -A || return 1
            # CI has no GPG key, so DCO sign-off only (-s) with signing
            # explicitly disabled. The example repos are not GPG-gated.
            # [skip ci] keeps this push from triggering the repo's own
            # orchestrate workflow.
            git -c commit.gpgsign=false commit --no-gpg-sign -s \
              -m "chore: repin to ${RC_VERSION} [skip ci]" || return 1
            # NORMAL push (no --force). Capture the exit status explicitly so a
            # ruleset rejection fails the repo rather than being swallowed.
            set +e
            push_out=$(git push origin HEAD:main 2>&1)
            status=$?
            set -e
            if [ "$status" -eq 0 ]; then
              echo "${slug} repinned to ${RC_VERSION} (attempt ${attempt})"
              return 0
            fi
            echo "push attempt ${attempt}/${MAX_ATTEMPTS} for ${slug} failed:"
            echo "$push_out"
            # Recover from a non-fast-forward rejection: reset onto the freshly
            # fetched remote tip and re-apply on the next iteration. A genuine
            # write-access (ruleset) rejection cannot fast-forward away, so it
            # surfaces here and on the final attempt reds the repo.
            git fetch origin main || return 1
            git reset --hard origin/main || return 1
            # Linear backoff: wait as many seconds as the attempt number.
            sleep "$attempt"
          done
          echo "::error::${slug} push rejected after ${MAX_ATTEMPTS} attempts (last output above)"
          return 1
        }

        # Confirm the repo's main actually carries the rc cli_version after the
        # push. Belt-and-suspenders: a silent no-op (push that landed nothing)
        # can never report green because this reads the published main back.
        #   $1 - owner/name slug of the repo to check.
        # Returns non-zero if the manifest cannot be read or is pinned elsewhere.
        verify_pinned() {
          local slug="$1" actual
          # pipefail is on, so a failed API call, a failed decode, or a manifest
          # with no cli_version line all fail the assignment.
          if ! actual=$(gh api "repos/${slug}/contents/.github/manifest.yaml" \
            --jq '.content' | base64 -d \
            | grep -E "^[[:space:]]*cli_version:" | head -n 1 \
            | sed -E 's|^[[:space:]]*cli_version:[[:space:]]*||' | tr -d '"' | tr -d "'"); then
            echo "::error::${slug} could not read cli_version back from main's .github/manifest.yaml"
            return 1
          fi
          if [ "$actual" != "$RC_VERSION" ]; then
            echo "::error::${slug} main cli_version is '${actual}', expected '${RC_VERSION}'"
            return 1
          fi
          echo "${slug} main verified at ${RC_VERSION}"
        }

        # Repin then verify every repo, collecting failures instead of stopping
        # at the first one so a single run reports the whole fleet.
        failed=""
        for name in $REPOS; do
          slug="${FLEET_OWNER}/cascade-example-${name}"
          echo "::group::repin ${slug} -> ${RC_VERSION}"
          ok=1
          # Subshell: keeps repin_repo's cd from leaking into the next repo.
          ( repin_repo "$slug" ) || ok=0
          if [ "$ok" -eq 1 ]; then
            verify_pinned "$slug" || ok=0
          fi
          [ "$ok" -eq 1 ] || failed="${failed} ${slug}"
          echo "::endgroup::"
        done
        if [ -n "$failed" ]; then
          echo "::error::Repin failed for:${failed}"
          exit 1
        fi
        echo "All example repos pinned to ${RC_VERSION}"
# Stage 1: the primary suite. Its dependents wait on it, and it in turn waits
# on repin so it can never run against a stale pin.
primary:
  name: primary
  runs-on: ubuntu-latest
  needs:
    - resolve
    - repin
  permissions:
    actions: read
    contents: read
  steps:
    # Checkout is required because dispatch-suite is a local action loaded
    # from this repository's tree.
    - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
    - name: Dispatch and watch primary
      uses: ./.github/actions/dispatch-suite
      with:
        token: ${{ secrets.CASCADE_STATE_TOKEN }}
        repo: ${{ env.FLEET_OWNER }}/cascade-example-primary
# Stage 2: the suites that depend on primary (they mutate primary's shared
# external state), so they start only once primary has gone green.
dependents:
  name: dependents (${{ matrix.repo }})
  runs-on: ubuntu-latest
  needs:
    - primary
  permissions:
    actions: read
    contents: read
  strategy:
    # Let each dependent run to completion even if its sibling fails.
    fail-fast: false
    matrix:
      repo:
        - artifact-a
        - artifact-b
  steps:
    # Checkout is required because dispatch-suite is a local action loaded
    # from this repository's tree.
    - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
    - name: Dispatch and watch
      uses: ./.github/actions/dispatch-suite
      with:
        token: ${{ secrets.CASCADE_STATE_TOKEN }}
        repo: ${{ env.FLEET_OWNER }}/cascade-example-${{ matrix.repo }}
# Stage 3: the independent suites. They run in parallel and are ordered only
# after repin, so none of them can run against a stale pin.
independents:
  name: independents (${{ matrix.repo }})
  runs-on: ubuntu-latest
  needs:
    - resolve
    - repin
  permissions:
    actions: read
    contents: read
  strategy:
    # Let every suite run to completion even if another one fails.
    fail-fast: false
    matrix:
      # Quoted so the digit-leading names are unmistakably strings.
      repo:
        - "4env"
        - "3env"
        - "2env"
        - single-env
        - release-only
  steps:
    # Checkout is required because dispatch-suite is a local action loaded
    # from this repository's tree.
    - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
    - name: Dispatch and watch
      uses: ./.github/actions/dispatch-suite
      with:
        token: ${{ secrets.CASCADE_STATE_TOKEN }}
        repo: ${{ env.FLEET_OWNER }}/cascade-example-${{ matrix.repo }}
# Fan-in: this job's conclusion is the rc fleet gate. It fails if any upstream
# fan-out job did not succeed and writes a per-stage result table to the
# summary.
aggregate:
  name: Fleet gate
  runs-on: ubuntu-latest
  needs:
    - resolve
    - repin
    - primary
    - dependents
    - independents
  # Only render a verdict when resolve actually ran and succeeded. When the
  # resolve gate filters a Release completion out (a non-success run, a
  # non-push source event such as merge_group, or a non-rc tag), resolve is
  # skipped, so this job is skipped too and the run is a clean no-op rather
  # than a false-red. A manual dispatch always passes the resolve gate, so a
  # dispatch that resolves no version is NOT a no-op: repin fails on the empty
  # version and this gate goes red. always() keeps the job running when an
  # upstream stage failed or was skipped, so the checks below can report it.
  if: always() && needs.resolve.result == 'success'
  permissions:
    contents: read
  steps:
    - name: Aggregate fleet result
      env:
        VERSION: ${{ needs.resolve.outputs.cascade_version }}
        R_REPIN: ${{ needs.repin.result }}
        R_PRIMARY: ${{ needs.primary.result }}
        R_DEPENDENTS: ${{ needs.dependents.result }}
        R_INDEPENDENTS: ${{ needs.independents.result }}
      run: |
        set -euo pipefail
        # Summary table: one row per stage, one output line per argument.
        printf '%s\n' \
          "## Fleet E2E result" \
          "" \
          "cascade version under test (pinned into every suite): \`${VERSION:-<empty>}\`" \
          "" \
          "| Stage | Result |" \
          "|---|---|" \
          "| repin (all 8 repos to rc) | $R_REPIN |" \
          "| primary | $R_PRIMARY |" \
          "| dependents (artifact-a, artifact-b) | $R_DEPENDENTS |" \
          "| independents (4env, 3env, 2env, single-env, release-only) | $R_INDEPENDENTS |" \
          "" \
          "> rc gate: this conclusion is the fleet validation signal for" \
          "> the rc tag. The repin step pinned each suite to this rc before" \
          "> fan-out, so a green gate validates the binary named above." \
          "> rc -> release promotion should consume the latest fleet-e2e" \
          "> conclusion for that tag before promoting." \
          >> "$GITHUB_STEP_SUMMARY"
        # Anything other than "success" (failure, cancelled, skipped) on any
        # stage reds the gate.
        for stage_result in "$R_REPIN" "$R_PRIMARY" "$R_DEPENDENTS" "$R_INDEPENDENTS"; do
          case "$stage_result" in
            success)
              ;;
            *)
              echo "::error::Fleet E2E failed: one or more suites did not pass"
              exit 1
              ;;
          esac
        done
        echo "Fleet E2E passed across all suites"