From 2f8676b317f9ca094eb9ae72cd67b4169c90d523 Mon Sep 17 00:00:00 2001 From: Alex Newman Date: Sat, 18 Apr 2026 17:14:52 +0000 Subject: [PATCH] ci: collapse preview + production deploy into one reusable workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preview and production share scripts/gcp-deploy.sh but each had its own job body in release.yml and production-deploy.yml — three copies of the same health-wait, STONITH, dashboard verify, drifting apart (preview already ran /cp/attest MRTD verify; prod didn't). Extract the common body into .github/workflows/deploy-cp.yml as a reusable workflow. release.yml deploy-preview and production-deploy.yml deploy both call it with env-specific inputs. Prod now runs the stronger MRTD attestation check preview already had, and every PR push exercises the exact code prod uses. Move the SSH+relaunch of dd-local-{kind} into a composite action .github/actions/relaunch-agent/ so deploy-cp.yml can cascade it directly. local-agents.yml shrinks to a workflow_dispatch-only entry point for operator-driven one-shots. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/actions/relaunch-agent/action.yml | 71 +++++ .github/workflows/deploy-cp.yml | 306 ++++++++++++++++++++++ .github/workflows/local-agents.yml | 225 ++-------------- .github/workflows/production-deploy.yml | 137 ++-------- .github/workflows/release.yml | 236 ++--------------- 5 files changed, 432 insertions(+), 543 deletions(-) create mode 100644 .github/actions/relaunch-agent/action.yml create mode 100644 .github/workflows/deploy-cp.yml diff --git a/.github/actions/relaunch-agent/action.yml b/.github/actions/relaunch-agent/action.yml new file mode 100644 index 0000000..a58a048 --- /dev/null +++ b/.github/actions/relaunch-agent/action.yml @@ -0,0 +1,71 @@ +name: Relaunch local TDX agent +description: >- + SSH into the tdx2 host and recreate the matching dd-local-{kind} libvirt + domain against the given CP url, pulling scripts from the given git ref. + Shared between Local Agents (manual workflow_dispatch) and Deploy CP (cascading + relaunch after a successful CP deploy). + +inputs: + kind: + description: 'prod | preview — which libvirt domain to relaunch' + required: true + url: + description: 'CP URL the agent should register against (e.g. https://app.devopsdefender.com)' + required: true + ref: + description: 'git ref whose scripts/apps tree dd-relaunch.sh should check out on the host' + required: true + ssh-key: + description: 'Private SSH key for tdx2@host' + required: true + host: + description: 'Public host address of the tdx2 node' + required: true + dd-pat: + description: 'GitHub PAT the agent uses to talk to the CP' + required: true + ita-api-key: + description: 'Intel Trust Authority API key for attestation' + required: true + +runs: + using: composite + steps: + # CP must be reachable before we SSH — an operator-driven dispatch may + # target a CP that is still coming up. /health is public. 
+ - name: Wait for CP to be healthy + shell: bash + env: + URL: ${{ inputs.url }} + run: | + for i in $(seq 1 60); do + if curl -fsS --max-time 5 "$URL/health" >/dev/null 2>&1; then + echo "CP $URL healthy after ${i} attempts" + exit 0 + fi + echo " waiting for $URL... (${i}/60)" + sleep 10 + done + echo "::error::CP $URL never came up within 10 min" + exit 1 + + # SSH in and relaunch the VM (destroy + redefine + start). Finishes + # in ~10 s — the baked config.iso's EE_BOOT_WORKLOADS drives the rest. + - name: ssh + relaunch VM + shell: bash + env: + SSH_KEY: ${{ inputs.ssh-key }} + HOST: ${{ inputs.host }} + DD_PAT: ${{ inputs.dd-pat }} + DD_ITA_API_KEY: ${{ inputs.ita-api-key }} + KIND: ${{ inputs.kind }} + URL: ${{ inputs.url }} + REF: ${{ inputs.ref }} + run: | + mkdir -p ~/.ssh + printf '%s\n' "$SSH_KEY" > ~/.ssh/id_ed25519 + chmod 600 ~/.ssh/id_ed25519 + ssh-keyscan -H "$HOST" >> ~/.ssh/known_hosts 2>/dev/null + ssh -o BatchMode=yes -o StrictHostKeyChecking=yes \ + -i ~/.ssh/id_ed25519 "tdx2@$HOST" \ + "DD_PAT='$DD_PAT' DD_ITA_API_KEY='$DD_ITA_API_KEY' /home/tdx2/src/dd/scripts/dd-relaunch.sh '$KIND' '$URL' '$REF'" diff --git a/.github/workflows/deploy-cp.yml b/.github/workflows/deploy-cp.yml new file mode 100644 index 0000000..94ea1d1 --- /dev/null +++ b/.github/workflows/deploy-cp.yml @@ -0,0 +1,306 @@ +name: Deploy CP + +# Reusable workflow: provision the CP TDX VM on GCP, wait for it to be +# healthy, verify attestation + dashboard + STONITH, and cascade a +# relaunch of the matching dd-local agent VM. Called from release.yml +# (preview path) and production-deploy.yml (prod path) with different +# inputs — both paths share this exact set of verification steps, so +# preview CI exercises the same code that prod runs. +# +# GitHub Actions allows ≤4 levels of workflow_call nesting. 
Today's +# chain is `release.yml → deploy-cp.yml` (2) and +# `production-deploy.yml → deploy-cp.yml` (2) — deep enough headroom +# that we can still call one more reusable workflow below us if needed. +# The agent-relaunch cascade uses a composite action (same-job, no +# nesting) to keep that headroom. + +on: + workflow_call: + inputs: + env: + description: 'DD_ENV (e.g. "production", "pr-42")' + required: true + type: string + hostname: + description: 'Public hostname (e.g. app.devopsdefender.com)' + required: true + type: string + gcp_environment: + description: 'GitHub environment name — "production" | "staging"' + required: true + type: string + workload_identity_provider: + description: 'GCP Workload Identity Federation provider resource name' + required: true + type: string + service_account: + description: 'GCP service account email' + required: true + type: string + release_tag: + description: 'devopsdefender release tag to deploy (e.g. "latest", "pr-abc123")' + required: true + type: string + oauth_enabled: + description: 'Enable GitHub OAuth (prod only; previews use PAT)' + required: false + type: boolean + default: false + comment_on_pr: + description: 'Leave a PR comment with the preview URL' + required: false + type: boolean + default: false + relaunch_agent: + description: 'After CP deploy, cascade a relaunch of dd-local-{env} via SSH' + required: false + type: boolean + default: true + ref: + description: 'Git ref the tdx2 host should pull before relaunching the agent VM' + required: false + type: string + default: main + +concurrency: + group: deploy-cp-${{ inputs.env }} + cancel-in-progress: false + +jobs: + deploy: + runs-on: ubuntu-latest + environment: ${{ inputs.gcp_environment }} + permissions: + contents: read + id-token: write + pull-requests: write + env: + DD_ENV: ${{ inputs.env }} + DD_HOSTNAME: ${{ inputs.hostname }} + GCP_ZONE: us-central1-c + steps: + - uses: actions/checkout@v4 + + - uses: google-github-actions/auth@v2 + with: + 
workload_identity_provider: ${{ inputs.workload_identity_provider }} + service_account: ${{ inputs.service_account }} + - uses: google-github-actions/setup-gcloud@v2 + + - name: Create TDX VM (boots from easyenclave, fetches dd from GitHub releases) + env: + GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} + CLOUDFLARE_API_TOKEN: ${{ secrets.DD_CP_CF_API_TOKEN }} + CLOUDFLARE_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }} + CLOUDFLARE_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }} + # OAuth only in environments that have these set (production). + # When empty, gcp-deploy.sh omits the workload env vars → + # dd-web disables /auth/github/* and serves /auth/pat only. + DD_GITHUB_CLIENT_ID: ${{ inputs.oauth_enabled && (vars.DD_GITHUB_CLIENT_ID || secrets.DD_GITHUB_CLIENT_ID) || '' }} + DD_GITHUB_CALLBACK_URL: ${{ inputs.oauth_enabled && vars.DD_GITHUB_CALLBACK_URL || '' }} + DD_GITHUB_CLIENT_SECRET: ${{ inputs.oauth_enabled && secrets.DD_GITHUB_CLIENT_SECRET || '' }} + # ITA — optional. When set, the CP mints + verifies quotes. + DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }} + DD_RELEASE_TAG: ${{ inputs.release_tag }} + run: scripts/gcp-deploy.sh + + - name: Wait for agent health (streams serial console) + env: + AGENT_URL: https://${{ inputs.hostname }} + GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} + run: | + VM_NAME=$(gcloud compute instances list \ + --project="$GCP_PROJECT_ID" \ + --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ + --format="value(name)" --sort-by=~creationTimestamp | head -1) + if [ -z "$VM_NAME" ]; then + echo "::error::no dd-${DD_ENV} VM found — gcp-deploy.sh must have failed" + exit 1 + fi + echo "Watching VM: $VM_NAME (zone: $GCP_ZONE)" + + LAST_LINES=0 + for i in $(seq 1 60); do + # Stream serial console so boot failures (DHCP hang, release + # fetch error, cloudflared exit, etc.) are visible without + # shelling into GCP. 
+ gcloud compute instances get-serial-port-output "$VM_NAME" \ + --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" 2>/dev/null \ + > /tmp/serial.log || true + TOTAL_LINES=$(wc -l < /tmp/serial.log) + if [ "$TOTAL_LINES" -gt "$LAST_LINES" ]; then + tail -n +$((LAST_LINES + 1)) /tmp/serial.log \ + | sed 's/^/[serial] /' + LAST_LINES=$TOTAL_LINES + fi + + if grep -qE "FATAL|Kernel panic|Invalid ELF header|/bin/sh: can't access tty" /tmp/serial.log; then + echo "::error::boot failed — serial log shows fatal pattern" + exit 1 + fi + + if curl -fsS "${AGENT_URL}/health" >/dev/null 2>&1; then + echo "Agent healthy at ${AGENT_URL}" + exit 0 + fi + echo " waiting for tunnel... (${i}/60)" + sleep 5 + done + echo "::error::Agent not healthy within 5 minutes" + echo "--- final serial tail ---" + tail -80 /tmp/serial.log | sed 's/^/[serial] /' + exit 1 + + - name: Verify NEW VM via TDX attestation + env: + AGENT_URL: https://${{ inputs.hostname }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # /cp/attest proves the freshly-deployed VM is serving the tunnel + # (stale tunnels point at old VMs that 404 on this endpoint). + # MRTD = 48 bytes at offset 184 in TDX quote v4; if non-zero, + # attestation actually worked. + NONCE=$(openssl rand -base64 16) + for attempt in $(seq 1 60); do + BODY=$(curl -sG -w '\n%{http_code}' \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + --data-urlencode "nonce=${NONCE}" \ + "${AGENT_URL}/cp/attest" || echo $'\n000') + CODE=$(echo "$BODY" | tail -n1) + JSON=$(echo "$BODY" | sed '$d') + if [ "$CODE" = "200" ]; then + QUOTE_B64=$(echo "$JSON" | jq -r '.quote_b64 // empty') + if [ -n "$QUOTE_B64" ] && [ "$QUOTE_B64" != "null" ]; then + MRTD=$(echo "$QUOTE_B64" | base64 -d \ + | dd bs=1 skip=184 count=48 status=none | xxd -p -c 48) + if [ -n "$MRTD" ] && [ "$MRTD" != "$(printf '00%.0s' {1..48})" ]; then + echo "NEW VM verified — MRTD: $MRTD" + exit 0 + fi + echo " /cp/attest 200 but MRTD empty/zero, retrying... 
(${attempt}/60)" + else + echo " /cp/attest 200 but no quote_b64, retrying... (${attempt}/60)" + fi + else + echo " /cp/attest returned HTTP ${CODE}, retrying... (${attempt}/60)" + fi + sleep 10 + done + echo "::error::/cp/attest never returned a valid quote — stale tunnel or new VM never came up" + exit 1 + + - name: Verify dashboard renders + env: + AGENT_URL: https://${{ inputs.hostname }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Fast sanity check on top of /cp/attest — proves dd-web is up + # and accepts the CI PAT's Bearer auth. + for attempt in $(seq 1 12); do + code=$(curl -s -o /dev/null -w '%{http_code}' \ + -H "Authorization: Bearer ${GITHUB_TOKEN}" \ + "${AGENT_URL}/" || echo 000) + if [ "$code" = "200" ]; then + echo "Dashboard renders (HTTP 200, attempt ${attempt})" + exit 0 + fi + echo " dashboard returned HTTP ${code}, retrying... (${attempt}/12)" + sleep 5 + done + echo "::error::dashboard / never returned 200 (last HTTP ${code})" + exit 1 + + - name: Verify STONITH halted prior VM(s) in this env + env: + GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} + run: | + # dd-register STONITHs the old VM on startup by deleting its + # CF tunnel → old cloudflared exits → old dd-register poweroffs. + # Scoped to this env — per-PR previews are hostname-isolated, + # so this only reaps prior deploys of the same env. 
+ NEW_VM=$(gcloud compute instances list \ + --project="$GCP_PROJECT_ID" \ + --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ + --format="value(name)" --sort-by=~creationTimestamp | head -1) + echo "new VM: $NEW_VM" + SURVIVORS="" + for i in $(seq 1 24); do + SURVIVORS=$(gcloud compute instances list \ + --project="$GCP_PROJECT_ID" \ + --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \ + --format="value(name)" \ + | grep -vx "$NEW_VM" || true) + if [ -z "$SURVIVORS" ]; then + echo "STONITH verified — only $NEW_VM running in ${DD_ENV}" + exit 0 + fi + echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')" + echo " waiting for STONITH poweroff... (${i}/24)" + sleep 5 + done + echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:" + echo "$SURVIVORS" + # shellcheck disable=SC2086 + gcloud compute instances delete $SURVIVORS \ + --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true + echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM" + + - name: Comment preview URL on PR + if: inputs.comment_on_pr && github.event_name == 'pull_request' + uses: actions/github-script@v7 + with: + script: | + const url = `https://${{ inputs.hostname }}`; + const body = [ + `### DD preview ready`, + ``, + `**URL:** ${url}`, + ``, + `Browser login: paste \`gh auth token\` output at ${url}/auth/pat`, + ``, + `CLI / curl: \`curl -H "Authorization: Bearer $(gh auth token)" ${url}/\``, + ``, + `Register endpoint for a local agent: \`wss://${{ inputs.hostname }}/register\``, + ].join('\n'); + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + const marker = '### DD preview ready'; + const existing = comments.find(c => c.user.type === 'Bot' && c.body && c.body.includes(marker)); + if (existing) { + await github.rest.issues.updateComment({ + owner: 
context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + }); + } + + # Cascade a relaunch of the matching dd-local-{env} libvirt domain + # on the tdx2 host. Preview runs dd-local-preview against the PR's + # CP; prod runs dd-local-prod against app.devopsdefender.com. + # Non-blocking (`continue-on-error`) because the openclaw boot + # chain inside dd-local-preview can take 30 min on first boot — + # we want PR status reflecting the CP deploy, with the agent + # relaunch as a signal-only exercise until vdc is warm. + - name: Relaunch dd-local-${{ inputs.env == 'production' && 'prod' || 'preview' }} + if: inputs.relaunch_agent + continue-on-error: true + uses: ./.github/actions/relaunch-agent + with: + kind: ${{ inputs.env == 'production' && 'prod' || 'preview' }} + url: https://${{ inputs.hostname }} + ref: ${{ inputs.ref }} + ssh-key: ${{ secrets.DD_LOCAL_SSH_KEY }} + host: ${{ secrets.DD_LOCAL_HOST }} + dd-pat: ${{ secrets.GITHUB_TOKEN }} + ita-api-key: ${{ secrets.DD_ITA_API_KEY }} diff --git a/.github/workflows/local-agents.yml b/.github/workflows/local-agents.yml index 44fb3f0..9a32b46 100644 --- a/.github/workflows/local-agents.yml +++ b/.github/workflows/local-agents.yml @@ -1,33 +1,14 @@ name: Local Agents -# Relaunches the local TDX agent VM on this user's host whenever the -# corresponding CP gets new code: -# - Production Deploy success → reboot dd-local-prod against app.devopsdefender.com -# - Release success on a PR → reboot dd-local-preview against pr-N.devopsdefender.com -# - Open PR's push → reboot dd-local-preview against pr-N, pulling -# the PR's ref so e2e tests exercise the PR's -# scripts/apps/ tree, not main's. -# -# SSHs in via key auth to a public-IP host, then invokes -# scripts/dd-relaunch.sh which handles the destroy/recreate cycle. 
+# Manual entry point for relaunching one of the local dd-local-{kind} +# libvirt domains on the tdx2 host. The everyday path (prod redeploy, +# preview PR push) now goes through deploy-cp.yml, which calls the +# relaunch-agent composite action directly after a successful CP +# deploy — so this workflow only exists for operator-driven one-shots: +# iterating on scripts/dd-relaunch.sh, re-running a relaunch without +# re-deploying the CP, etc. on: - workflow_run: - workflows: ["Release", "Production Deploy"] - types: [completed] - # Every non-README push to main fires a prod relaunch directly. - push: - branches: [main] - paths-ignore: - - "README.md" - # PR updates fire a preview relaunch on dd-local-preview, using the - # PR's own scripts/local-agents.sh + apps/ tree. End-to-end tests - # the PR's deployment code (not main's). Races with Release's - # deploy-preview that creates the pr-N CP; we poll that CP's /health - # before SSHing so the agent registers cleanly. - pull_request: - paths-ignore: - - "README.md" workflow_dispatch: inputs: kind: @@ -38,193 +19,29 @@ on: description: 'CP URL (e.g. 
https://app.devopsdefender.com)' required: true default: 'https://app.devopsdefender.com' + ref: + description: 'git ref whose scripts/apps tree to check out on the host' + required: true + default: 'main' permissions: contents: read - pull-requests: read concurrency: - group: local-agents-${{ github.event.pull_request.number || github.event.workflow_run.name || github.event.inputs.kind || 'prod' }} + group: local-agents-${{ github.event.inputs.kind }} cancel-in-progress: false jobs: relaunch: - if: | - github.event_name == 'workflow_dispatch' - || github.event_name == 'push' - || github.event_name == 'pull_request' - || github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - id: pick - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - EVENT: ${{ github.event_name }} - WF: ${{ github.event.workflow_run.name }} - BRANCH: ${{ github.event.workflow_run.head_branch }} - DISPATCH_KIND: ${{ github.event.inputs.kind }} - DISPATCH_URL: ${{ github.event.inputs.cp_url }} - PR_NUMBER: ${{ github.event.pull_request.number }} - PR_HEAD_REF: ${{ github.event.pull_request.head.ref }} - run: | - if [ "$EVENT" = "workflow_dispatch" ]; then - echo "kind=$DISPATCH_KIND" >> "$GITHUB_OUTPUT" - echo "url=$DISPATCH_URL" >> "$GITHUB_OUTPUT" - echo "ref=main" >> "$GITHUB_OUTPUT" - elif [ "$EVENT" = "pull_request" ]; then - # PR update → test dd-local-preview against the PR's CP - # using the PR's own scripts/apps. Exercises the full - # containerised boot chain end-to-end. - echo "kind=preview" >> "$GITHUB_OUTPUT" - echo "url=https://pr-$PR_NUMBER.devopsdefender.com" >> "$GITHUB_OUTPUT" - echo "ref=$PR_HEAD_REF" >> "$GITHUB_OUTPUT" - elif [ "$EVENT" = "push" ] || [ "$WF" = "Production Deploy" ]; then - # push-to-main on local-agent scripts, or a prod CP redeploy - # → relaunch dd-local-prod against the live prod CP. 
- echo "kind=prod" >> "$GITHUB_OUTPUT" - echo "url=https://app.devopsdefender.com" >> "$GITHUB_OUTPUT" - echo "ref=main" >> "$GITHUB_OUTPUT" - else - # Release on a PR: derive pr-N. Released-on-main returns - # no open PR → skip (Production Deploy will fire shortly). - pr=$(gh pr list --head "$BRANCH" --state open \ - --repo "${{ github.repository }}" \ - --json number --jq '.[0].number' 2>/dev/null || true) - if [ -n "$pr" ]; then - echo "kind=preview" >> "$GITHUB_OUTPUT" - echo "url=https://pr-$pr.devopsdefender.com" >> "$GITHUB_OUTPUT" - echo "ref=$BRANCH" >> "$GITHUB_OUTPUT" - else - echo "kind=skip" >> "$GITHUB_OUTPUT" - fi - fi - - # Wait for the target CP to be reachable before relaunching. - # On pull_request we fire in parallel with Release's deploy-preview - # which takes ~3 min to stand up the pr-N CP; without this wait the - # agent would start registering against a CP that doesn't exist yet - # and fail with DNS errors. /health is public; no auth needed. - - name: Wait for CP to be healthy - if: steps.pick.outputs.kind != 'skip' - env: - URL: ${{ steps.pick.outputs.url }} - run: | - for i in $(seq 1 60); do - if curl -fsS --max-time 5 "$URL/health" >/dev/null 2>&1; then - echo "CP $URL healthy after ${i} attempts" - exit 0 - fi - echo " waiting for $URL... (${i}/60)" - sleep 10 - done - echo "::error::CP $URL never came up within 10 min" - exit 1 - - # SSH in and relaunch the VM (destroy + redefine + start). Finishes - # in ~10 s — the baked config.iso's EE_BOOT_WORKLOADS now includes - # podman + ollama + openclaw + cloudflared + dd-agent, so there's - # no separate HTTPS deploy step needed. Workloads self-sequence via - # `until` loops in their cmd scripts (see apps/*/workload.json). 
- - name: ssh + relaunch VM - if: steps.pick.outputs.kind != 'skip' - env: - SSH_KEY: ${{ secrets.DD_LOCAL_SSH_KEY }} - HOST: ${{ secrets.DD_LOCAL_HOST }} - DD_PAT: ${{ secrets.GITHUB_TOKEN }} - DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }} - KIND: ${{ steps.pick.outputs.kind }} - URL: ${{ steps.pick.outputs.url }} - REF: ${{ steps.pick.outputs.ref }} - run: | - mkdir -p ~/.ssh - printf '%s\n' "$SSH_KEY" > ~/.ssh/id_ed25519 - chmod 600 ~/.ssh/id_ed25519 - ssh-keyscan -H "$HOST" >> ~/.ssh/known_hosts 2>/dev/null - ssh -o BatchMode=yes -o StrictHostKeyChecking=yes \ - -i ~/.ssh/id_ed25519 "tdx2@$HOST" \ - "DD_PAT='$DD_PAT' DD_ITA_API_KEY='$DD_ITA_API_KEY' /home/tdx2/src/dd/scripts/dd-relaunch.sh '$KIND' '$URL' '$REF'" - - # Verify dd-local-preview registered and openclaw is responding. - # Only runs on pull_request + workflow_dispatch kind=preview — - # those are the flows where we own the VM's full lifecycle in - # this workflow run. For prod flows we'd rather not block a CP - # redeploy on openclaw readiness. - # Non-blocking: dd-local-preview doesn't currently bake ollama/openclaw - # into its boot workloads (see local-agents.sh — only nv/mount/cf/dd-agent). - # Until the preview VM carries the full apps/ tree, treat this step as a - # signal-only probe so SSH+relaunch success gates the PR. - - name: Verify dd-local-preview openclaw round-trip - continue-on-error: true - if: | - steps.pick.outputs.kind == 'preview' - && (github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch') - env: - DD_PAT: ${{ secrets.GITHUB_TOKEN }} - URL: ${{ steps.pick.outputs.url }} - run: | - AUTH=(-H "Authorization: Bearer $DD_PAT") - # Find the freshly-registered dd-local-preview agent. 
- started_at=$(date -u +%Y-%m-%dT%H:%M:%SZ) - agent_host="" - for i in $(seq 1 60); do - agent_host=$(curl -fsS "${AUTH[@]}" "$URL/api/agents" 2>/dev/null \ - | jq -r --arg since "$started_at" ' - [.[] | select(.vm_name=="dd-local-preview" and .status=="healthy" and .last_seen > $since)] - | sort_by(.last_seen) | reverse | .[0].hostname // empty' 2>/dev/null || true) - [ -n "$agent_host" ] && [ "$agent_host" != "null" ] && break - sleep 10 - done - if [ -z "$agent_host" ] || [ "$agent_host" = "null" ]; then - echo "::error::dd-local-preview never registered with $URL within 10 min" - exit 1 - fi - echo " agent: https://$agent_host" - # Poll openclaw /healthz via podman exec through the agent's /exec. - # First run pulls ~900 MB image + qwen2.5:0.5b (~400 MB) + npm installs - # @ollama/openclaw. 15 min ceiling covers a cold cache. - # First-boot: ~900 MB image pull + qwen2.5:0.5b (~400 MB) + npm - # install of @ollama/openclaw. 30-min ceiling covers a fully - # cold vdc. Cached-run completes in <2 min. - # - # Every 6th tick (1 min) probe what's actually running inside - # the ollama container via /exec — gives us live visibility - # into where in the ollama → openclaw launch chain we're stuck. - probe() { - echo " -- probe t+${1}s:" - # Is openclaw process running inside the container? - curl -fsS --max-time 15 "${AUTH[@]}" -H 'content-type: application/json' \ - "https://$agent_host/exec" \ - -d '{"cmd":["/var/lib/easyenclave/bin/dd-podman","exec","ollama","sh","-c","ps -ef 2>/dev/null | grep -E \"openclaw|ollama\" | grep -v grep | head -10"],"timeout_secs":15}' \ - 2>/dev/null | jq -r '.stdout // ""' | sed 's/^/ /' - # What models has ollama seen? 
- curl -fsS --max-time 15 "${AUTH[@]}" -H 'content-type: application/json' \ - "https://$agent_host/exec" \ - -d '{"cmd":["/var/lib/easyenclave/bin/dd-podman","exec","ollama","ollama","list"],"timeout_secs":15}' \ - 2>/dev/null | jq -r '.stdout // ""' | sed 's/^/ [ollama list] /' - } - for i in $(seq 1 180); do - resp=$(curl -fsS --max-time 30 "${AUTH[@]}" -H 'content-type: application/json' \ - "https://$agent_host/exec" \ - -d '{"cmd":["/var/lib/easyenclave/bin/dd-podman","exec","ollama","curl","-fsS","http://127.0.0.1:18789/healthz"],"timeout_secs":15}' \ - 2>/dev/null || true) - if echo "$resp" | jq -e '.exit_code == 0' >/dev/null 2>&1; then - echo " openclaw /healthz 200" - echo "$resp" | jq -r '.stdout // ""' | head -c 400 - echo - exit 0 - fi - echo " waiting for openclaw gateway... (${i}/180)" - if [ $((i % 6)) -eq 0 ]; then - probe $((i * 10)) || true - fi - sleep 10 - done - echo "::error::openclaw /healthz never returned 200 within 30 min" - # Dump openclaw workload log tail for post-mortem. - echo "--- openclaw workload log ---" - curl -fsS --max-time 30 "${AUTH[@]}" \ - "https://$agent_host/exec" \ - -d '{"cmd":["/var/lib/easyenclave/bin/dd-podman","exec","ollama","sh","-c","ps -ef | grep -i openclaw; ls -la /root/.openclaw 2>&1 | head -20"],"timeout_secs":15}' \ - 2>/dev/null | jq -r '.stdout // .stderr // ""' - exit 1 + - uses: ./.github/actions/relaunch-agent + with: + kind: ${{ github.event.inputs.kind }} + url: ${{ github.event.inputs.cp_url }} + ref: ${{ github.event.inputs.ref }} + ssh-key: ${{ secrets.DD_LOCAL_SSH_KEY }} + host: ${{ secrets.DD_LOCAL_HOST }} + dd-pat: ${{ secrets.GITHUB_TOKEN }} + ita-api-key: ${{ secrets.DD_ITA_API_KEY }} diff --git a/.github/workflows/production-deploy.yml b/.github/workflows/production-deploy.yml index 8253179..04f91a6 100644 --- a/.github/workflows/production-deploy.yml +++ b/.github/workflows/production-deploy.yml @@ -7,6 +7,9 @@ name: Production Deploy # we don't promote. 
# - workflow_dispatch: manual re-deploy of any existing tag (e.g. a # known-good v0.2.0 after a bad main push). +# +# Body lives in deploy-cp.yml — same workflow PR previews use, so every +# PR exercises the prod deploy path before it lands here. on: workflow_run: @@ -20,127 +23,31 @@ on: required: false default: 'latest' -concurrency: - group: dd-production - cancel-in-progress: false - -env: - GCP_ZONE: us-central1-c - DD_ENV: production - DD_DOMAIN: ${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - permissions: contents: read jobs: - # dd-register STONITHs the old VM on startup by deleting its CF - # tunnel, so no explicit teardown here. deploy: - # workflow_run fires on every Release completion, including - # failures. Only promote on success. + # workflow_run fires on every Release completion, including failures. + # Only promote on success. if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' + # A called workflow can only narrow the caller's GITHUB_TOKEN scopes, so + # grant everything deploy-cp.yml's job declares or the run fails validation. + permissions: + contents: read + id-token: write + pull-requests: write - runs-on: ubuntu-latest - environment: production - permissions: - contents: read - id-token: write - steps: - - uses: actions/checkout@v4 - - uses: google-github-actions/auth@v2 - with: - workload_identity_provider: 'projects/779946350556/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - service_account: 'easyenclave-production-ci@easyenclave.iam.gserviceaccount.com' - - uses: google-github-actions/setup-gcloud@v2 - - - name: Create TDX VM (boots from easyenclave, fetches dd from GitHub releases) - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - CLOUDFLARE_API_TOKEN: ${{ secrets.DD_CP_CF_API_TOKEN }} - CLOUDFLARE_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }} - CLOUDFLARE_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }} - DD_GITHUB_CLIENT_ID: ${{ vars.DD_GITHUB_CLIENT_ID || secrets.DD_GITHUB_CLIENT_ID }} - DD_GITHUB_CALLBACK_URL: ${{ vars.DD_GITHUB_CALLBACK_URL }} - DD_GITHUB_CLIENT_SECRET: ${{ secrets.DD_GITHUB_CLIENT_SECRET }} - # Intel Trust Authority — optional. 
When the secret is set, - # the CP mints its own ITA token and verifies incoming agent - # registrations. DD_ITA_REQUIRED stays false (default). - DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }} - # workflow_run has no `inputs`; fall back to `latest`, which - # release.yml just (re)published on push to main. - DD_RELEASE_TAG: ${{ inputs.release_tag || 'latest' }} - run: scripts/gcp-deploy.sh - - - name: Wait for agent health - env: - AGENT_URL: https://app.${{ env.DD_DOMAIN }} - run: | - for i in $(seq 1 60); do - curl -fsS "${AGENT_URL}/health" >/dev/null 2>&1 && { - echo "Agent healthy at ${AGENT_URL}" - exit 0 - } - echo " waiting for tunnel... (${i}/60)" - sleep 5 - done - echo "::error::Agent not healthy within 5 minutes" - exit 1 - - - name: Verify dashboard renders - env: - AGENT_URL: https://app.${{ env.DD_DOMAIN }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # New auth model: dashboard expects a GitHub PAT/GITHUB_TOKEN with - # access to the dd repo; the CP verifies against DD_OWNER via the - # standard /user + /repos/{owner}/dd fallback. No OIDC audience wiring. - for attempt in $(seq 1 12); do - code=$(curl -s -o /dev/null -w '%{http_code}' \ - -H "Authorization: Bearer ${GITHUB_TOKEN}" \ - "${AGENT_URL}/" || echo 000) - if [ "$code" = "200" ]; then - echo "Dashboard renders (HTTP 200, attempt ${attempt})" - exit 0 - fi - echo " dashboard returned HTTP ${code}, retrying... (${attempt}/12)" - sleep 5 - done - echo "::error::dashboard / never returned 200 (last HTTP ${code})" - exit 1 - - - name: Verify STONITH halted prior production VM(s) - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - GCP_ZONE: ${{ env.GCP_ZONE }} - run: | - # Mirror of release.yml's verify-step for PR previews. Give - # STONITH-by-tunnel-delete 120s to work on well-behaved old - # prod VMs (their cloudflared exits → dd-register poweroffs - # → GCP TERMINATED → cleanup.yml reaps). 
After the timeout, - # force-delete any remaining RUNNING prod VMs so we don't - # leak compute indefinitely. - NEW_VM=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=production" \ - --format="value(name)" --sort-by=~creationTimestamp | head -1) - echo "new VM: $NEW_VM" - SURVIVORS="" - for i in $(seq 1 24); do - SURVIVORS=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=production AND status=RUNNING" \ - --format="value(name)" \ - | grep -vx "$NEW_VM" || true) - if [ -z "$SURVIVORS" ]; then - echo "STONITH verified — only $NEW_VM running in prod" - exit 0 - fi - echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')" - echo " waiting for STONITH poweroff... (${i}/24)" - sleep 5 - done - echo "::warning::STONITH-by-tunnel-delete timed out in prod; force-deleting:" - echo "$SURVIVORS" - # shellcheck disable=SC2086 - gcloud compute instances delete $SURVIVORS \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true - echo "zombies reaped; $NEW_VM is the only production VM" + uses: ./.github/workflows/deploy-cp.yml + with: + env: production + hostname: app.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} + gcp_environment: production + workload_identity_provider: 'projects/779946350556/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' + service_account: 'easyenclave-production-ci@easyenclave.iam.gserviceaccount.com' + # workflow_run has no `inputs`; fall back to `latest`, which + # release.yml just (re)published on push to main. 
+ release_tag: ${{ inputs.release_tag || 'latest' }} + oauth_enabled: true + comment_on_pr: false + ref: main + secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b238c85..3647b34 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,10 +31,6 @@ permissions: id-token: write attestations: write -env: - DD_DOMAIN: ${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - GCP_ZONE: us-central1-c - jobs: build: runs-on: ubuntu-latest @@ -121,223 +117,21 @@ jobs: # Each PR gets its own env at pr-{N}.{domain} with DD_ENV=pr-{N} # (hostname-isolated, no OAuth — browser access via /auth/pat). # main/v* produce releases that production-deploy picks up separately. + # + # Body lives in deploy-cp.yml — same workflow prod uses, so every PR + # exercises the prod deploy path. deploy-preview: if: github.event_name == 'pull_request' needs: build - runs-on: ubuntu-latest - environment: staging - permissions: - contents: read - id-token: write - pull-requests: write - env: - DD_ENV: pr-${{ github.event.number }} - DD_HOSTNAME: pr-${{ github.event.number }}.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - steps: - - uses: actions/checkout@v4 - - - uses: google-github-actions/auth@v2 - with: - workload_identity_provider: 'projects/654815109728/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - service_account: 'easyenclave-staging-ci@eestaging.iam.gserviceaccount.com' - - uses: google-github-actions/setup-gcloud@v2 - - - name: Create TDX VM (boots from easyenclave, fetches dd from GitHub releases) - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - CLOUDFLARE_API_TOKEN: ${{ secrets.DD_CP_CF_API_TOKEN }} - CLOUDFLARE_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }} - CLOUDFLARE_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }} - # OAuth env vars intentionally omitted — gcp-deploy.sh sees - # empty DD_GITHUB_CLIENT_ID and skips them in the workload - # spec. 
dd-web then disables /auth/github/* and serves - # /auth/pat for browser access. - # - # Intel Trust Authority — optional. When the secret is set, - # the CP mints its own ITA token at startup and verifies - # agent-supplied tokens on /register. DD_ITA_REQUIRED stays - # false (default) so unsigned agents still register. - DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }} - DD_RELEASE_TAG: ${{ needs.build.outputs.tag }} - run: scripts/gcp-deploy.sh - - - name: Wait for agent health (streams serial console) - env: - AGENT_URL: https://${{ env.DD_HOSTNAME }} - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - GCP_ZONE: ${{ env.GCP_ZONE }} - run: | - VM_NAME=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ - --format="value(name)" --sort-by=~creationTimestamp | head -1) - if [ -z "$VM_NAME" ]; then - echo "::error::no dd-${DD_ENV} VM found — gcp-deploy.sh must have failed" - exit 1 - fi - echo "Watching VM: $VM_NAME (zone: $GCP_ZONE)" - - LAST_LINES=0 - for i in $(seq 1 60); do - # Stream serial console so boot failures (DHCP hang, GitHub - # release fetch error, cloudflared exit, etc.) are visible - # without shelling into GCP. - gcloud compute instances get-serial-port-output "$VM_NAME" \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" 2>/dev/null \ - > /tmp/serial.log || true - TOTAL_LINES=$(wc -l < /tmp/serial.log) - if [ "$TOTAL_LINES" -gt "$LAST_LINES" ]; then - tail -n +$((LAST_LINES + 1)) /tmp/serial.log \ - | sed 's/^/[serial] /' - LAST_LINES=$TOTAL_LINES - fi - - if grep -qE "FATAL|Kernel panic|Invalid ELF header|/bin/sh: can't access tty" /tmp/serial.log; then - echo "::error::boot failed — serial log shows fatal pattern" - exit 1 - fi - - # /health via the Cloudflare tunnel tests the full chain: - # VM boot → easyenclave init → github_release fetch of dd + - # cloudflared → cloudflared tunnel up. 
- if curl -fsS "${AGENT_URL}/health" >/dev/null 2>&1; then - echo "Agent healthy at ${AGENT_URL}" - exit 0 - fi - echo " waiting for tunnel... (${i}/60)" - sleep 5 - done - echo "::error::Agent not healthy within 5 minutes" - echo "--- final serial tail ---" - tail -80 /tmp/serial.log | sed 's/^/[serial] /' - exit 1 - - - name: Verify NEW VM via TDX attestation - env: - AGENT_URL: https://${{ env.DD_HOSTNAME }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # /cp/attest proves the freshly-deployed VM is serving the tunnel - # (stale tunnels point at old VMs that 404 on this endpoint). - # Auth: GITHUB_TOKEN via Bearer — the CP's /repos/{owner}/dd probe - # accepts any token with repo access. No OIDC audience wiring. - NONCE=$(openssl rand -base64 16) - - # 60 × 10s = 10 min. New VM has to boot, fetch cloudflared - # and dd from GitHub releases, start, and bring its tunnel up. - for attempt in $(seq 1 60); do - BODY=$(curl -sG -w '\n%{http_code}' \ - -H "Authorization: Bearer ${GITHUB_TOKEN}" \ - --data-urlencode "nonce=${NONCE}" \ - "${AGENT_URL}/cp/attest" || echo $'\n000') - CODE=$(echo "$BODY" | tail -n1) - JSON=$(echo "$BODY" | sed '$d') - if [ "$CODE" = "200" ]; then - QUOTE_B64=$(echo "$JSON" | jq -r '.quote_b64 // empty') - if [ -n "$QUOTE_B64" ] && [ "$QUOTE_B64" != "null" ]; then - # MRTD = 48 bytes at offset 184 in TDX quote v4. - # If it's non-zero, attestation actually worked. - MRTD=$(echo "$QUOTE_B64" | base64 -d \ - | dd bs=1 skip=184 count=48 status=none | xxd -p -c 48) - if [ -n "$MRTD" ] && [ "$MRTD" != "$(printf '00%.0s' {1..48})" ]; then - echo "NEW VM verified — MRTD: $MRTD" - exit 0 - fi - echo " /cp/attest 200 but MRTD empty/zero, retrying... (${attempt}/60)" - else - echo " /cp/attest 200 but no quote_b64, retrying... (${attempt}/60)" - fi - else - echo " /cp/attest returned HTTP ${CODE}, retrying... 
(${attempt}/60)" - fi - sleep 10 - done - echo "::error::/cp/attest never returned a valid quote — stale tunnel or new VM never came up" - exit 1 - - - name: Verify STONITH halted prior VM(s) in this env - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - GCP_ZONE: ${{ env.GCP_ZONE }} - run: | - # STONITH (dd-register deletes the old tunnel → old cloudflared - # exits → old dd-register poweroffs the VM) is the ONLY cleanup - # mechanism. Scoped to this PR's env — previews are - # hostname-isolated from each other, so this only reaps prior - # deploys of the same PR (re-pushes). - NEW_VM=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ - --format="value(name)" --sort-by=~creationTimestamp | head -1) - echo "new VM: $NEW_VM" - - # Give STONITH-by-tunnel-delete 120s to work on well-behaved - # old VMs (their cloudflared exits → dd-register poweroffs). - # After that, force-delete any remaining survivors: they're - # zombies whose dd-register failed before creating a tunnel - # (e.g. CF auth error at boot — see src/cp.rs - # which now kernel_poweroff's on init failure, so this is a - # safety net for pre-fix zombies and any future init failure - # modes we haven't handled). - SURVIVORS="" - for i in $(seq 1 24); do - SURVIVORS=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \ - --format="value(name)" \ - | grep -vx "$NEW_VM" || true) - if [ -z "$SURVIVORS" ]; then - echo "STONITH verified — only $NEW_VM running in dd_env=${DD_ENV}" - exit 0 - fi - echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')" - echo " waiting for STONITH poweroff... 
(${i}/24)" - sleep 5 - done - echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:" - echo "$SURVIVORS" - # shellcheck disable=SC2086 - gcloud compute instances delete $SURVIVORS \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true - echo "zombies reaped; $NEW_VM is the only $DD_ENV VM" - - - name: Comment preview URL on PR - uses: actions/github-script@v7 - with: - script: | - const url = `https://${process.env.DD_HOSTNAME}`; - const body = [ - `### DD preview ready`, - ``, - `**URL:** ${url}`, - ``, - `Browser login: paste \`gh auth token\` output at ${url}/auth/pat`, - ``, - `CLI / curl: \`curl -H "Authorization: Bearer $(gh auth token)" ${url}/\``, - ``, - `Register endpoint for a local agent: \`wss://${process.env.DD_HOSTNAME}/register\``, - ].join('\n'); - - // Update existing bot comment if present, else create. - const { data: comments } = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - }); - const marker = '### DD preview ready'; - const existing = comments.find(c => c.user.type === 'Bot' && c.body && c.body.includes(marker)); - if (existing) { - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: existing.id, - body, - }); - } else { - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body, - }); - } + uses: ./.github/workflows/deploy-cp.yml + with: + env: pr-${{ github.event.number }} + hostname: pr-${{ github.event.number }}.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} + gcp_environment: staging + workload_identity_provider: 'projects/654815109728/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' + service_account: 'easyenclave-staging-ci@eestaging.iam.gserviceaccount.com' + release_tag: ${{ needs.build.outputs.tag }} + oauth_enabled: false + 
comment_on_pr: true + ref: ${{ github.event.pull_request.head.ref }} + secrets: inherit