From 2a0ba235a610c1b080f8746417c1579a9fca7e39 Mon Sep 17 00:00:00 2001 From: Alex Newman Date: Sat, 18 Apr 2026 21:26:04 +0000 Subject: [PATCH] ci(deploy-cp): run STONITH verify last, after the agent relaunch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The relaunch cascade itself triggers a second wave of STONITH activity: the tdx2 agent re-registers with the fresh CP, the old CP's CF tunnel is deleted, and the old CP's self-watchdog then powers off the VM. Verifying after the relaunch captures that wave — the previous position verified only the kill that happens when the new CP first registers its own tunnel. Also reorders user-facing deliverables (PR comment, local agent back online) ahead of the slowest + flakiest check (24×5s loop with a fallback `gcloud delete`), so those land first. No behavior change on the happy path. If relaunch fails, the STONITH verification no longer runs — but operator attention is already needed in that case, and the old VMs self-terminate via the CF-tunnel-delete trigger at register time either way. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/deploy-cp.yml | 77 ++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/.github/workflows/deploy-cp.yml b/.github/workflows/deploy-cp.yml index 772d750..f3fff7a 100644 --- a/.github/workflows/deploy-cp.yml +++ b/.github/workflows/deploy-cp.yml @@ -275,41 +275,6 @@ jobs: echo "::error::dashboard / never returned 200 (last HTTP ${code})" exit 1 - - name: Verify STONITH halted prior VM(s) in this env - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - run: | - # dd-register STONITHs the old VM on startup by deleting its - # CF tunnel → old cloudflared exits → old dd-register poweroffs. - # Scoped to this env — per-PR previews are hostname-isolated, - # so this only reaps prior deploys of the same env. 
- NEW_VM=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ - --format="value(name)" --sort-by=~creationTimestamp | head -1) - echo "new VM: $NEW_VM" - SURVIVORS="" - for i in $(seq 1 24); do - SURVIVORS=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \ - --format="value(name)" \ - | grep -vx "$NEW_VM" || true) - if [ -z "$SURVIVORS" ]; then - echo "STONITH verified — only $NEW_VM running in ${DD_ENV}" - exit 0 - fi - echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')" - echo " waiting for STONITH poweroff... (${i}/24)" - sleep 5 - done - echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:" - echo "$SURVIVORS" - # shellcheck disable=SC2086 - gcloud compute instances delete $SURVIVORS \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true - echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM" - - name: Comment preview URL on PR if: inputs.comment_on_pr && github.event_name == 'pull_request' uses: actions/github-script@v7 @@ -365,3 +330,45 @@ jobs: host: ${{ secrets.DD_LOCAL_HOST }} dd-pat: ${{ secrets.GITHUB_TOKEN }} ita-api-key: ${{ secrets.DD_ITA_API_KEY }} + + # Runs last so the relaunch cascade's own STONITH wave (old agent + # re-registers → old CP's CF tunnel is deleted → old CP poweroffs) + # is captured by this verification, in addition to the kill that + # happens when the new CP first registers its own tunnel. Also + # keeps the slowest + flakiest verify (24×5s loop + fallback + # force-delete) behind the user-facing outputs (PR comment, + # relaunched local agent). 
+ - name: Verify STONITH halted prior VM(s) in this env + env: + GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} + run: | + # dd-register STONITHs the old VM on startup by deleting its + # CF tunnel → old cloudflared exits → old dd-register poweroffs. + # Scoped to this env — per-PR previews are hostname-isolated, + # so this only reaps prior deploys of the same env. + NEW_VM=$(gcloud compute instances list \ + --project="$GCP_PROJECT_ID" \ + --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ + --format="value(name)" --sort-by=~creationTimestamp | head -1) + echo "new VM: $NEW_VM" + SURVIVORS="" + for i in $(seq 1 24); do + SURVIVORS=$(gcloud compute instances list \ + --project="$GCP_PROJECT_ID" \ + --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \ + --format="value(name)" \ + | grep -vx "$NEW_VM" || true) + if [ -z "$SURVIVORS" ]; then + echo "STONITH verified — only $NEW_VM running in ${DD_ENV}" + exit 0 + fi + echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')" + echo " waiting for STONITH poweroff... (${i}/24)" + sleep 5 + done + echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:" + echo "$SURVIVORS" + # shellcheck disable=SC2086 + gcloud compute instances delete $SURVIVORS \ + --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true + echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM"