Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 42 additions & 35 deletions .github/workflows/deploy-cp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -275,41 +275,6 @@ jobs:
echo "::error::dashboard / never returned 200 (last HTTP ${code})"
exit 1

- name: Verify STONITH halted prior VM(s) in this env
env:
GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
run: |
# Purpose: wait (up to 24 polls, 5 s apart) until the newest managed VM
# is the only RUNNING VM in this env; if others are still running after
# that, force-delete them.
# DD_ENV and GCP_ZONE are not set by this step — presumably they come
# from the job/workflow environment; confirm.
#
# dd-register STONITHs the old VM on startup by deleting its
# CF tunnel → old cloudflared exits → old dd-register poweroffs.
# Scoped to this env — per-PR previews are hostname-isolated,
# so this only reaps prior deploys of the same env.
#
# Newest managed VM in this env: sorted by creation time descending,
# first line only. The status of that VM is not checked here.
NEW_VM=$(gcloud compute instances list \
--project="$GCP_PROJECT_ID" \
--filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \
--format="value(name)" --sort-by=~creationTimestamp | head -1)
echo "new VM: $NEW_VM"
SURVIVORS=""
for i in $(seq 1 24); do
# RUNNING VMs in this env, minus the exact-line match of NEW_VM.
# `|| true` keeps grep's exit status 1 (no lines left) from failing
# the command.
# NOTE(review): if NEW_VM is empty, `grep -vx ""` only drops empty
# lines, so every running VM is counted as a survivor.
# NOTE(review): a failed gcloud call yields empty output here, which
# is indistinguishable from "no survivors".
SURVIVORS=$(gcloud compute instances list \
--project="$GCP_PROJECT_ID" \
--filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \
--format="value(name)" \
| grep -vx "$NEW_VM" || true)
if [ -z "$SURVIVORS" ]; then
echo "STONITH verified — only $NEW_VM running in ${DD_ENV}"
exit 0
fi
echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')"
echo " waiting for STONITH poweroff... (${i}/24)"
sleep 5
done
# Reached only when all 24 polls still found survivors.
echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:"
echo "$SURVIVORS"
# SURVIVORS is left unquoted on purpose so each name becomes its own
# argument. All names are deleted with the single zone in GCP_ZONE.
# `|| true` makes a failed delete non-fatal; the final message below is
# printed regardless of the delete's outcome.
# shellcheck disable=SC2086
gcloud compute instances delete $SURVIVORS \
--project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true
echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM"

- name: Comment preview URL on PR
if: inputs.comment_on_pr && github.event_name == 'pull_request'
uses: actions/github-script@v7
Expand Down Expand Up @@ -365,3 +330,45 @@ jobs:
host: ${{ secrets.DD_LOCAL_HOST }}
dd-pat: ${{ secrets.GITHUB_TOKEN }}
ita-api-key: ${{ secrets.DD_ITA_API_KEY }}

# Runs last so the relaunch cascade's own STONITH wave (old agent
# re-registers → old CP's CF tunnel is deleted → old CP poweroffs)
# is captured by this verification, in addition to the kill that
# happens when the new CP first registers its own tunnel. Also
# keeps the slowest + flakiest verify (24×5s loop + fallback
# force-delete) behind the user-facing outputs (PR comment,
# relaunched local agent).
- name: Verify STONITH halted prior VM(s) in this env
  env:
    GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
  run: |
    # Purpose: wait (up to 24 polls, 5 s apart) until the newest managed
    # VM is the only RUNNING VM in this env; if others are still running
    # after that, force-delete them.
    # DD_ENV and GCP_ZONE are not set by this step; they are expected to
    # come from the job/workflow environment.
    #
    # dd-register STONITHs the old VM on startup by deleting its
    # CF tunnel → old cloudflared exits → old dd-register poweroffs.
    # Scoped to this env — per-PR previews are hostname-isolated,
    # so this only reaps prior deploys of the same env.

    # Newest managed VM in this env (creation time descending, first line).
    NEW_VM=$(gcloud compute instances list \
      --project="$GCP_PROJECT_ID" \
      --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \
      --format="value(name)" --sort-by=~creationTimestamp | head -1)
    # Without a name to exclude, every running VM in the env (the fresh
    # one included) would be treated as a zombie and force-deleted below,
    # so refuse to continue.
    if [ -z "$NEW_VM" ]; then
      echo "::error::could not determine the newest ${DD_ENV} VM; refusing to reap"
      exit 1
    fi
    echo "new VM: $NEW_VM"

    SURVIVORS=""
    for i in $(seq 1 24); do
      # The listing is kept out of the grep pipeline so that a failed
      # gcloud call cannot be mistaken for "no other VM is running".
      if ! RUNNING=$(gcloud compute instances list \
        --project="$GCP_PROJECT_ID" \
        --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \
        --format="value(name)"); then
        echo "  listing running VMs failed; retrying... (${i}/24)"
        sleep 5
        continue
      fi
      # -F: match the VM name literally; -x: whole line; -v: drop it.
      # `|| true` absorbs grep's exit status 1 when no line is left.
      SURVIVORS=$(printf '%s\n' "$RUNNING" | grep -vxF -e "$NEW_VM" || true)
      if [ -z "$SURVIVORS" ]; then
        echo "STONITH verified — only $NEW_VM running in ${DD_ENV}"
        exit 0
      fi
      echo "  still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')"
      echo "  waiting for STONITH poweroff... (${i}/24)"
      sleep 5
    done

    # An empty list here means no poll ever produced a survivor list,
    # i.e. every listing attempt failed: nothing was verified and there
    # is nothing safe to delete.
    if [ -z "$SURVIVORS" ]; then
      echo "::error::could not list ${DD_ENV} VMs; STONITH not verified"
      exit 1
    fi

    echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:"
    echo "$SURVIVORS"
    # SURVIVORS is left unquoted on purpose so each name becomes its own
    # argument. The delete stays best-effort (a failure does not fail the
    # step), but success is only reported when it actually succeeded.
    # shellcheck disable=SC2086
    if gcloud compute instances delete $SURVIVORS \
      --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet; then
      echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM"
    else
      echo "::warning::force-delete failed; zombie VM(s) may still be running in ${DD_ENV}"
    fi
Loading