Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions .github/actions/relaunch-agent/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Composite action metadata: the display name, a folded one-paragraph
# description, and the inputs a caller has to provide.
name: Relaunch local TDX agent
description: >-
  SSH into the tdx2 host and recreate the matching dd-local-{kind} libvirt
  domain against the given CP url, pulling scripts from the given git ref.
  Shared between Local Agents (push/PR/dispatch) and Deploy CP (cascading
  relaunch after a successful CP deploy).

# Every input is mandatory. The first three select what to relaunch and
# against which CP; the remaining four are connection details and
# credentials.
inputs:
  # --- what to relaunch ---
  kind:
    required: true
    description: 'prod | preview — which libvirt domain to relaunch'
  url:
    required: true
    description: 'CP URL the agent should register against (e.g. https://app.devopsdefender.com)'
  ref:
    required: true
    description: 'git ref whose scripts/apps tree dd-relaunch.sh should check out on the host'

  # --- how to reach the host ---
  ssh-key:
    required: true
    description: 'Private SSH key for tdx2@host'
  host:
    required: true
    description: 'Public host address of the tdx2 node'

  # --- credentials handed to the relaunched agent ---
  dd-pat:
    required: true
    description: 'GitHub PAT the agent uses to talk to the CP'
  ita-api-key:
    required: true
    description: 'Intel Trust Authority API key for attestation'

runs:
  using: composite
  steps:
    # Gate on the CP before any SSH happens: on PR pushes this action
    # races Release's deploy-preview, which may still be standing up the
    # pr-N CP. /health is public, so no credentials are sent here.
    - name: Wait for CP to be healthy
      shell: bash
      env:
        URL: ${{ inputs.url }}
      run: |
        # Up to 60 probes, 10 s apart; each probe is capped at 5 s by
        # --max-time so a hung connection cannot stall the loop.
        attempt=0
        while [ "$attempt" -lt 60 ]; do
          attempt=$((attempt + 1))
          if curl -fsS --max-time 5 "$URL/health" >/dev/null 2>&1; then
            echo "CP $URL healthy after ${attempt} attempts"
            exit 0
          fi
          echo " waiting for $URL... (${attempt}/60)"
          sleep 10
        done
        # Every probe failed: surface an Actions error annotation.
        echo "::error::CP $URL never came up within 10 min"
        exit 1

# SSH in and relaunch the VM (destroy + redefine + start). Finishes
# in ~10 s — the baked config.iso's EE_BOOT_WORKLOADS drives the rest.
#
# All inputs reach the script through env vars (never through
# `${{ }}` interpolation inside `run:`), and every value that is
# forwarded to the remote shell is shell-quoted first.
- name: ssh + relaunch VM
  shell: bash
  env:
    SSH_KEY: ${{ inputs.ssh-key }}
    HOST: ${{ inputs.host }}
    DD_PAT: ${{ inputs.dd-pat }}
    DD_ITA_API_KEY: ${{ inputs.ita-api-key }}
    KIND: ${{ inputs.kind }}
    URL: ${{ inputs.url }}
    REF: ${{ inputs.ref }}
  run: |
    # Create the key material owner-only from the first byte, instead
    # of relying on the chmod that follows the write.
    umask 077
    mkdir -p ~/.ssh
    printf '%s\n' "$SSH_KEY" > ~/.ssh/id_ed25519
    chmod 600 ~/.ssh/id_ed25519

    # NOTE(review): ssh-keyscan is trust-on-first-use — the host key is
    # accepted as whatever answers at $HOST right now, so
    # StrictHostKeyChecking=yes only guards against a key change within
    # this single run. Pinning a known host key would need a new input.
    ssh-keyscan -H "$HOST" >> ~/.ssh/known_hosts 2>/dev/null

    # ssh joins its arguments into one string that the remote shell
    # re-parses. Hand-written '...' around each value breaks (or
    # injects) as soon as a value contains a single quote, so let
    # printf %q produce the quoting. For plain tokens/URLs/refs the
    # resulting command is identical to the previous one.
    # Assumes the remote login shell is bash-compatible (%q may emit
    # $'...' for control characters) — TODO confirm for tdx2.
    remote_cmd=$(printf 'DD_PAT=%q DD_ITA_API_KEY=%q /home/tdx2/src/dd/scripts/dd-relaunch.sh %q %q %q' \
      "$DD_PAT" "$DD_ITA_API_KEY" "$KIND" "$URL" "$REF")

    ssh -o BatchMode=yes -o StrictHostKeyChecking=yes \
      -i ~/.ssh/id_ed25519 "tdx2@$HOST" \
      "$remote_cmd"
307 changes: 307 additions & 0 deletions .github/workflows/deploy-cp.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,307 @@
name: Deploy CP

# Reusable workflow that provisions the CP TDX VM on GCP, waits for it
# to become healthy, verifies attestation + dashboard + STONITH, and
# then cascades a relaunch of the matching dd-local agent VM.
# release.yml (preview path) and production-deploy.yml (prod path) both
# call it with different inputs; because the two paths share this exact
# set of verification steps, preview CI exercises the same code that
# prod runs.
#
# Nesting budget: GitHub Actions allows ≤4 levels of workflow_call
# nesting. Today's chains — `release.yml → deploy-cp.yml` and
# `production-deploy.yml → deploy-cp.yml` — are each 2 levels deep, so
# one more reusable workflow could still be called below this one if
# needed. The agent-relaunch cascade is a composite action (same job,
# no nesting) to keep that headroom.

on:
  workflow_call:
    inputs:
      # --- target environment ---
      env:
        description: 'DD_ENV (e.g. "production", "pr-42")'
        type: string
        required: true
      hostname:
        description: 'Public hostname (e.g. app.devopsdefender.com)'
        type: string
        required: true
      gcp_environment:
        description: 'GitHub environment name — "production" | "staging"'
        type: string
        required: true

      # --- GCP identity ---
      workload_identity_provider:
        description: 'GCP Workload Identity Federation provider resource name'
        type: string
        required: true
      service_account:
        description: 'GCP service account email'
        type: string
        required: true

      # --- what to deploy ---
      release_tag:
        description: 'devopsdefender release tag to deploy (e.g. "latest", "pr-abc123")'
        type: string
        required: true

      # --- optional behaviour switches ---
      oauth_enabled:
        description: 'Enable GitHub OAuth (prod only; previews use PAT)'
        type: boolean
        required: false
        default: false
      comment_on_pr:
        description: 'Leave a PR comment with the preview URL'
        type: boolean
        required: false
        default: false
      relaunch_agent:
        description: 'After CP deploy, cascade a relaunch of dd-local-{env} via SSH'
        type: boolean
        required: false
        default: true
      ref:
        description: 'Git ref the tdx2 host should pull before relaunching the agent VM'
        type: string
        required: false
        default: main

# One concurrency group per DD_ENV; a newer run queues behind an
# in-flight one instead of cancelling it.
concurrency:
  group: deploy-cp-${{ inputs.env }}
  cancel-in-progress: false

jobs:
  deploy:
    runs-on: ubuntu-latest
    environment: ${{ inputs.gcp_environment }}
    # Token scopes for this job. Presumably id-token is for the
    # Workload Identity Federation auth step and pull-requests is for
    # the PR comment step later in this job — confirm before trimming.
    permissions:
      id-token: write
      contents: read
      pull-requests: write
    # Shared by every step's shell below.
    env:
      GCP_ZONE: us-central1-c
      DD_ENV: ${{ inputs.env }}
      DD_HOSTNAME: ${{ inputs.hostname }}
    steps:
      - uses: actions/checkout@v4

      # Authenticate to GCP with the caller-supplied provider and
      # service account, then install gcloud.
      - uses: google-github-actions/auth@v2
        with:
          service_account: ${{ inputs.service_account }}
          workload_identity_provider: ${{ inputs.workload_identity_provider }}
      - uses: google-github-actions/setup-gcloud@v2

      # Everything scripts/gcp-deploy.sh needs is handed over as env.
      - name: Create TDX VM (boots from easyenclave, fetches dd from GitHub releases)
        run: scripts/gcp-deploy.sh
        env:
          DD_RELEASE_TAG: ${{ inputs.release_tag }}
          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
          DD_DOMAIN: ${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }}
          # Cloudflare credentials.
          CLOUDFLARE_API_TOKEN: ${{ secrets.DD_CP_CF_API_TOKEN }}
          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }}
          CLOUDFLARE_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }}
          # GitHub OAuth: each value collapses to '' unless
          # oauth_enabled is true (production). When empty,
          # gcp-deploy.sh omits the workload env vars → dd-web disables
          # /auth/github/* and serves /auth/pat only.
          DD_GITHUB_CLIENT_ID: ${{ inputs.oauth_enabled && (vars.DD_GITHUB_CLIENT_ID || secrets.DD_GITHUB_CLIENT_ID) || '' }}
          DD_GITHUB_CALLBACK_URL: ${{ inputs.oauth_enabled && vars.DD_GITHUB_CALLBACK_URL || '' }}
          DD_GITHUB_CLIENT_SECRET: ${{ inputs.oauth_enabled && secrets.DD_GITHUB_CLIENT_SECRET || '' }}
          # ITA — optional. When set, the CP mints + verifies quotes.
          DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }}

# Poll the new CP's /health through its public hostname while echoing
# the VM's serial console, so boot failures are visible in the job log.
- name: Wait for agent health (streams serial console)
  env:
    AGENT_URL: https://${{ inputs.hostname }}
    GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
  run: |
    # Newest managed VM labelled with this env is the one just created.
    VM_NAME=$(gcloud compute instances list \
      --project="$GCP_PROJECT_ID" \
      --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \
      --format="value(name)" --sort-by=~creationTimestamp | head -1)
    if [ -z "$VM_NAME" ]; then
      echo "::error::no dd-${DD_ENV} VM found — gcp-deploy.sh must have failed"
      exit 1
    fi
    echo "Watching VM: $VM_NAME (zone: $GCP_ZONE)"

    # Start from an empty capture so wc/grep/tail always have a file.
    : > /tmp/serial.log
    LAST_LINES=0
    for i in $(seq 1 60); do
      # Stream serial console so boot failures (DHCP hang, release
      # fetch error, cloudflared exit, etc.) are visible without
      # shelling into GCP. Fetch into a scratch file and only promote
      # it on success, so a transient gcloud failure does not blank
      # the last good capture (and with it the final diagnostic tail).
      if gcloud compute instances get-serial-port-output "$VM_NAME" \
          --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" 2>/dev/null \
          > /tmp/serial.new; then
        mv /tmp/serial.new /tmp/serial.log
      fi
      # Print only the lines that appeared since the previous poll.
      TOTAL_LINES=$(wc -l < /tmp/serial.log)
      if [ "$TOTAL_LINES" -gt "$LAST_LINES" ]; then
        tail -n +$((LAST_LINES + 1)) /tmp/serial.log \
          | sed 's/^/[serial] /'
        LAST_LINES=$TOTAL_LINES
      fi

      # Fail fast on known-fatal boot signatures.
      if grep -qE "FATAL|Kernel panic|Invalid ELF header|/bin/sh: can't access tty" /tmp/serial.log; then
        echo "::error::boot failed — serial log shows fatal pattern"
        exit 1
      fi

      # --max-time bounds each probe; without it one hung connection
      # could stall the loop well past the advertised budget.
      if curl -fsS --max-time 10 "${AGENT_URL}/health" >/dev/null 2>&1; then
        echo "Agent healthy at ${AGENT_URL}"
        exit 0
      fi
      echo " waiting for tunnel... (${i}/60)"
      sleep 5
    done
    echo "::error::Agent not healthy within 5 minutes"
    echo "--- final serial tail ---"
    tail -80 /tmp/serial.log | sed 's/^/[serial] /'
    exit 1

# Ask the CP for a TDX quote and accept the deploy only once the quote
# carries a non-zero MRTD.
- name: Verify NEW VM via TDX attestation
  env:
    AGENT_URL: https://${{ inputs.hostname }}
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    # /cp/attest proves the freshly-deployed VM is serving the tunnel
    # (stale tunnels point at old VMs that 404 on this endpoint).
    # MRTD = 48 bytes at offset 184 in TDX quote v4; if non-zero,
    # attestation actually worked.
    NONCE=$(openssl rand -base64 16)
    # 48 zero bytes rendered as hex (96 chars), computed once.
    ZERO_MRTD=$(printf '00%.0s' {1..48})
    for attempt in $(seq 1 60); do
      # The status code is appended on its own last line; --max-time
      # keeps a hung request from stalling the loop. On transport
      # failure the fallback guarantees a trailing 000 line.
      BODY=$(curl -sG --max-time 30 -w '\n%{http_code}' \
        -H "Authorization: Bearer ${GITHUB_TOKEN}" \
        --data-urlencode "nonce=${NONCE}" \
        "${AGENT_URL}/cp/attest" || echo $'\n000')
      CODE=$(echo "$BODY" | tail -n1)
      JSON=$(echo "$BODY" | sed '$d')
      if [ "$CODE" = "200" ]; then
        # The step shell runs with errexit + pipefail, so an unguarded
        # jq/base64 failure here would abort the whole step instead of
        # reaching the retry branches below. Swallow the failure and
        # let the empty-value checks drive the retry.
        QUOTE_B64=$(echo "$JSON" | jq -r '.quote_b64 // empty' 2>/dev/null || true)
        if [ -n "$QUOTE_B64" ] && [ "$QUOTE_B64" != "null" ]; then
          MRTD=$(echo "$QUOTE_B64" | base64 -d 2>/dev/null \
            | dd bs=1 skip=184 count=48 status=none | xxd -p -c 48 || true)
          if [ -n "$MRTD" ] && [ "$MRTD" != "$ZERO_MRTD" ]; then
            echo "NEW VM verified — MRTD: $MRTD"
            exit 0
          fi
          echo " /cp/attest 200 but MRTD empty/zero, retrying... (${attempt}/60)"
        else
          echo " /cp/attest 200 but no quote_b64, retrying... (${attempt}/60)"
        fi
      else
        echo " /cp/attest returned HTTP ${CODE}, retrying... (${attempt}/60)"
      fi
      sleep 10
    done
    echo "::error::/cp/attest never returned a valid quote — stale tunnel or new VM never came up"
    exit 1

# Request the dashboard root with the job token as a Bearer credential
# and require an HTTP 200 within 12 attempts.
- name: Verify dashboard renders
  env:
    AGENT_URL: https://${{ inputs.hostname }}
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  run: |
    # Fast sanity check on top of /cp/attest — proves dd-web is up
    # and accepts the CI PAT's Bearer auth.
    for attempt in $(seq 1 12); do
      # On a transport failure curl's -w already prints 000, so the
      # fallback must not echo a second 000 (that produced a two-line
      # "000\n000" code in the log). `|| true` only neutralises the
      # non-zero exit; ${code:-000} covers the no-output case.
      # --max-time bounds each attempt.
      code=$(curl -s -o /dev/null --max-time 10 -w '%{http_code}' \
        -H "Authorization: Bearer ${GITHUB_TOKEN}" \
        "${AGENT_URL}/" || true)
      code=${code:-000}
      if [ "$code" = "200" ]; then
        echo "Dashboard renders (HTTP 200, attempt ${attempt})"
        exit 0
      fi
      echo " dashboard returned HTTP ${code}, retrying... (${attempt}/12)"
      sleep 5
    done
    echo "::error::dashboard / never returned 200 (last HTTP ${code})"
    exit 1

# Confirm that the newest VM is the only one still RUNNING in this env;
# after a grace period, force-delete any older VM that is still up.
- name: Verify STONITH halted prior VM(s) in this env
  env:
    GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
  run: |
    # dd-register STONITHs the old VM on startup by deleting its
    # CF tunnel → old cloudflared exits → old dd-register poweroffs.
    # Scoped to this env — per-PR previews are hostname-isolated,
    # so this only reaps prior deploys of the same env.
    NEW_VM=$(gcloud compute instances list \
      --project="$GCP_PROJECT_ID" \
      --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \
      --format="value(name)" --sort-by=~creationTimestamp | head -1)
    # Without a name to keep, the exclusion below would match nothing
    # and every running VM — including the new one — would be treated
    # as a zombie and deleted. Refuse to continue instead.
    if [ -z "$NEW_VM" ]; then
      echo "::error::no ${DD_ENV} VM found — cannot tell which VM to keep"
      exit 1
    fi
    echo "new VM: $NEW_VM"
    SURVIVORS=""
    for i in $(seq 1 24); do
      # Keep the gcloud call out of the `| grep ... || true` pipeline:
      # there a failed listing looked exactly like "no survivors" and
      # was reported as a successful STONITH.
      if RUNNING=$(gcloud compute instances list \
          --project="$GCP_PROJECT_ID" \
          --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \
          --format="value(name)"); then
        # -F: match the VM name literally; grep exits 1 when nothing
        # is left, which is the success case, hence `|| true`.
        SURVIVORS=$(printf '%s\n' "$RUNNING" | grep -Fvx -- "$NEW_VM" || true)
        if [ -z "$SURVIVORS" ]; then
          echo "STONITH verified — only $NEW_VM running in ${DD_ENV}"
          exit 0
        fi
        echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')"
        echo " waiting for STONITH poweroff... (${i}/24)"
      else
        echo " listing running VMs failed, retrying... (${i}/24)"
      fi
      sleep 5
    done
    # SURVIVORS can only be empty here if no listing ever succeeded.
    if [ -z "$SURVIVORS" ]; then
      echo "::error::could not list running VMs in ${DD_ENV} — STONITH unverified"
      exit 1
    fi
    echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:"
    echo "$SURVIVORS"
    # Reaping stays best-effort (a failure does not fail the step), but
    # the log now says which outcome actually happened.
    # shellcheck disable=SC2086
    if gcloud compute instances delete $SURVIVORS \
        --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet; then
      echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM"
    else
      echo "::warning::force-delete failed; zombie VM(s) may still be running in ${DD_ENV}"
    fi

# Create (or refresh) a single bot comment on the PR that advertises
# the preview URL. Only runs when the caller asked for it and the
# triggering event is a pull_request.
- name: Comment preview URL on PR
  if: inputs.comment_on_pr && github.event_name == 'pull_request'
  uses: actions/github-script@v7
  env:
    # Passed as data rather than interpolated into the script text, so
    # the value can never be parsed as JavaScript.
    PREVIEW_HOSTNAME: ${{ inputs.hostname }}
  with:
    script: |
      const host = process.env.PREVIEW_HOSTNAME;
      const url = `https://${host}`;
      // The heading doubles as the marker used to find an earlier
      // comment from a previous run.
      const marker = '### DD preview ready';
      const body = [
        marker,
        ``,
        `**URL:** ${url}`,
        ``,
        `Browser login: paste \`gh auth token\` output at ${url}/auth/pat`,
        ``,
        `CLI / curl: \`curl -H "Authorization: Bearer $(gh auth token)" ${url}/\``,
        ``,
        `Register endpoint for a local agent: \`wss://${host}/register\``,
      ].join('\n');
      // listComments returns one page (30 items) by default; paginate
      // so the marker comment is still found on busy PRs instead of a
      // duplicate being posted.
      const comments = await github.paginate(github.rest.issues.listComments, {
        owner: context.repo.owner,
        repo: context.repo.repo,
        issue_number: context.issue.number,
        per_page: 100,
      });
      const existing = comments.find(
        c => c.user?.type === 'Bot' && c.body && c.body.includes(marker)
      );
      if (existing) {
        await github.rest.issues.updateComment({
          owner: context.repo.owner,
          repo: context.repo.repo,
          comment_id: existing.id,
          body,
        });
      } else {
        await github.rest.issues.createComment({
          owner: context.repo.owner,
          repo: context.repo.repo,
          issue_number: context.issue.number,
          body,
        });
      }

# Final step: cascade a relaunch of the matching dd-local-{env} libvirt
# domain on the tdx2 host. Preview runs dd-local-preview against the
# PR's CP; prod runs dd-local-prod against app.devopsdefender.com.
#
# Marked `continue-on-error` (non-blocking) because the openclaw boot
# chain inside dd-local-preview can take 30 min on first boot — PR
# status should reflect the CP deploy, with the agent relaunch as a
# signal-only exercise until vdc is warm.
#
# The same `production → prod, anything else → preview` expression
# feeds both the step name and the action's `kind` input.
- name: Relaunch dd-local-${{ inputs.env == 'production' && 'prod' || 'preview' }}
  uses: ./.github/actions/relaunch-agent
  if: inputs.relaunch_agent
  continue-on-error: true
  with:
    # What to relaunch, and against which CP.
    kind: ${{ inputs.env == 'production' && 'prod' || 'preview' }}
    url: https://${{ inputs.hostname }}
    ref: ${{ inputs.ref }}
    # How to reach the tdx2 host.
    host: ${{ secrets.DD_LOCAL_HOST }}
    ssh-key: ${{ secrets.DD_LOCAL_SSH_KEY }}
    # Credentials forwarded to the agent.
    # NOTE(review): GITHUB_TOKEN is tied to this workflow job — confirm
    # it is still valid by the time a slow-booting agent uses it.
    dd-pat: ${{ secrets.GITHUB_TOKEN }}
    ita-api-key: ${{ secrets.DD_ITA_API_KEY }}
Loading
Loading