diff --git a/.github/workflows/deploy-cp.yml b/.github/workflows/deploy-cp.yml
index 772d750..f3fff7a 100644
--- a/.github/workflows/deploy-cp.yml
+++ b/.github/workflows/deploy-cp.yml
@@ -275,41 +275,6 @@ jobs:
           echo "::error::dashboard / never returned 200 (last HTTP ${code})"
           exit 1
 
-      - name: Verify STONITH halted prior VM(s) in this env
-        env:
-          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
-        run: |
-          # dd-register STONITHs the old VM on startup by deleting its
-          # CF tunnel → old cloudflared exits → old dd-register poweroffs.
-          # Scoped to this env — per-PR previews are hostname-isolated,
-          # so this only reaps prior deploys of the same env.
-          NEW_VM=$(gcloud compute instances list \
-            --project="$GCP_PROJECT_ID" \
-            --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \
-            --format="value(name)" --sort-by=~creationTimestamp | head -1)
-          echo "new VM: $NEW_VM"
-          SURVIVORS=""
-          for i in $(seq 1 24); do
-            SURVIVORS=$(gcloud compute instances list \
-              --project="$GCP_PROJECT_ID" \
-              --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \
-              --format="value(name)" \
-              | grep -vx "$NEW_VM" || true)
-            if [ -z "$SURVIVORS" ]; then
-              echo "STONITH verified — only $NEW_VM running in ${DD_ENV}"
-              exit 0
-            fi
-            echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')"
-            echo " waiting for STONITH poweroff... (${i}/24)"
-            sleep 5
-          done
-          echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:"
-          echo "$SURVIVORS"
-          # shellcheck disable=SC2086
-          gcloud compute instances delete $SURVIVORS \
-            --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true
-          echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM"
-
       - name: Comment preview URL on PR
         if: inputs.comment_on_pr && github.event_name == 'pull_request'
         uses: actions/github-script@v7
@@ -365,3 +330,58 @@ jobs:
           host: ${{ secrets.DD_LOCAL_HOST }}
           dd-pat: ${{ secrets.GITHUB_TOKEN }}
           ita-api-key: ${{ secrets.DD_ITA_API_KEY }}
+
+      # Runs last so the relaunch cascade's own STONITH wave (old agent
+      # re-registers → old CP's CF tunnel is deleted → old CP poweroffs)
+      # is captured by this verification, in addition to the kill that
+      # happens when the new CP first registers its own tunnel. Also
+      # keeps the slowest + flakiest verify (24×5s loop + fallback
+      # force-delete) behind the user-facing outputs (PR comment,
+      # relaunched local agent).
+      - name: Verify STONITH halted prior VM(s) in this env
+        env:
+          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
+        run: |
+          # dd-register STONITHs the old VM on startup by deleting its
+          # CF tunnel → old cloudflared exits → old dd-register poweroffs.
+          # Scoped to this env — per-PR previews are hostname-isolated,
+          # so this only reaps prior deploys of the same env.
+          NEW_VM=$(gcloud compute instances list \
+            --project="$GCP_PROJECT_ID" \
+            --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \
+            --format="value(name)" --sort-by=~creationTimestamp | head -1)
+          # Guard: with an empty NEW_VM, `grep -vx ""` below excludes
+          # nothing, so every running VM in this env (the fresh one
+          # included) would be listed as a survivor and force-deleted.
+          if [ -z "$NEW_VM" ]; then
+            echo "::error::could not determine the newest ${DD_ENV} VM; refusing to reap"
+            exit 1
+          fi
+          echo "new VM: $NEW_VM"
+          SURVIVORS=""
+          # Poll up to 24 times, 5s apart, for the old VM(s) to stop.
+          for i in $(seq 1 24); do
+            SURVIVORS=$(gcloud compute instances list \
+              --project="$GCP_PROJECT_ID" \
+              --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \
+              --format="value(name)" \
+              | grep -vx "$NEW_VM" || true)
+            if [ -z "$SURVIVORS" ]; then
+              echo "STONITH verified — only $NEW_VM running in ${DD_ENV}"
+              exit 0
+            fi
+            echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')"
+            echo " waiting for STONITH poweroff... (${i}/24)"
+            sleep 5
+          done
+          echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:"
+          echo "$SURVIVORS"
+          # Best-effort: a failed delete stays non-fatal, but only claim
+          # the zombies are gone when gcloud actually succeeded.
+          # shellcheck disable=SC2086
+          if gcloud compute instances delete $SURVIVORS \
+            --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet; then
+            echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM"
+          else
+            echo "::warning::force-delete failed; zombies may still be running in ${DD_ENV}"
+          fi