From 939e2179336bfb90bc7236f5bd04c71b24f38798 Mon Sep 17 00:00:00 2001 From: Alex Newman Date: Sat, 18 Apr 2026 17:46:44 +0000 Subject: [PATCH] ci: fold every deploy path into Release, drop scripts/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Release now owns the full lifecycle: build → deploy-preview (PR) OR deploy-production (main / dispatch) → relaunch-agent (blocking) → verify agent re-registered with CP. A release is "done" only when the local dd-local-{kind} VM is back online talking to the freshly-deployed CP — that's the signal that tells us a PR is safe to merge or a merge actually shipped. Deleted: .github/workflows/production-deploy.yml — folded into release.yml as a deploy-production job with same deploy-cp.yml body. .github/workflows/local-agents.yml — manual-dispatch path gone; push a commit to trigger a relaunch via the cascade. Deleted scripts/: scripts/gcp-deploy.sh — inlined into deploy-cp.yml. scripts/dd-relaunch.sh → apps/_infra/dd-relaunch.sh (host-side). scripts/local-agents.sh → apps/_infra/local-agents.sh (host-side). scripts/workloads.sh — dead after inline; only gcp-deploy sourced it and local-agents.sh built workloads via inline jq anyway. scripts/redeploy-workload.sh — unused helper, removed. deploy-cp.yml's Relaunch step drops `continue-on-error: true`; the relaunch-agent composite gains a "Verify agent registered with CP" step that polls /api/agents for a freshly-registered dd-local-{kind} entry with a 5-min budget. Concurrency on release.yml becomes expression-driven: PR pushes cancel in-progress runs; main / tag / dispatch queue so an in-flight prod deploy finishes cleanly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/actions/relaunch-agent/action.yml | 42 +++++- .github/workflows/deploy-cp.yml | 97 +++++++++++--- .github/workflows/local-agents.yml | 47 ------- .github/workflows/production-deploy.yml | 53 -------- .github/workflows/release.yml | 71 ++++++++--- README.md | 12 +- apps/_infra/dd-relaunch.sh | 52 ++++++++ {scripts => apps/_infra}/local-agents.sh | 6 +- scripts/dd-relaunch.sh | 57 --------- scripts/gcp-deploy.sh | 148 ---------------------- scripts/redeploy-workload.sh | 50 -------- scripts/workloads.sh | 54 -------- 12 files changed, 227 insertions(+), 462 deletions(-) delete mode 100644 .github/workflows/local-agents.yml delete mode 100644 .github/workflows/production-deploy.yml create mode 100755 apps/_infra/dd-relaunch.sh rename {scripts => apps/_infra}/local-agents.sh (97%) delete mode 100755 scripts/dd-relaunch.sh delete mode 100755 scripts/gcp-deploy.sh delete mode 100755 scripts/redeploy-workload.sh delete mode 100755 scripts/workloads.sh diff --git a/.github/actions/relaunch-agent/action.yml b/.github/actions/relaunch-agent/action.yml index a58a048..b449289 100644 --- a/.github/actions/relaunch-agent/action.yml +++ b/.github/actions/relaunch-agent/action.yml @@ -1,9 +1,9 @@ name: Relaunch local TDX agent description: >- - SSH into the tdx2 host and recreate the matching dd-local-{kind} libvirt - domain against the given CP url, pulling scripts from the given git ref. - Shared between Local Agents (push/PR/dispatch) and Deploy CP (cascading - relaunch after a successful CP deploy). + SSH into the tdx2 host, recreate the matching dd-local-{kind} libvirt + domain against the given CP url (pulling apps/ from the given git ref), + then block until the agent re-registers with the CP. A release is "done" + only when this action succeeds end-to-end. 
inputs: kind: @@ -68,4 +68,36 @@ runs: ssh-keyscan -H "$HOST" >> ~/.ssh/known_hosts 2>/dev/null ssh -o BatchMode=yes -o StrictHostKeyChecking=yes \ -i ~/.ssh/id_ed25519 "tdx2@$HOST" \ - "DD_PAT='$DD_PAT' DD_ITA_API_KEY='$DD_ITA_API_KEY' /home/tdx2/src/dd/scripts/dd-relaunch.sh '$KIND' '$URL' '$REF'" + "DD_PAT='$DD_PAT' DD_ITA_API_KEY='$DD_ITA_API_KEY' /home/tdx2/src/dd/apps/_infra/dd-relaunch.sh '$KIND' '$URL' '$REF'" + + # Block until the freshly-booted agent VM registers with the CP. + # This is the "I can see the local agent deployment worked" signal + # that gates the whole release. 5-min budget covers a cold VM boot + # (~60s) + cloudflared tunnel (~30s) + agent startup + register — + # plenty of headroom. Doesn't probe openclaw/ollama readiness — + # that first-boot pays a 30-min npm-install tax and isn't part + # of the release gate. + - name: Verify agent registered with CP + shell: bash + env: + URL: ${{ inputs.url }} + DD_PAT: ${{ inputs.dd-pat }} + KIND: ${{ inputs.kind }} + run: | + vm="dd-local-$KIND" + started_at=$(date -u +%Y-%m-%dT%H:%M:%SZ) + AUTH=(-H "Authorization: Bearer $DD_PAT") + for i in $(seq 1 30); do + host=$(curl -fsS --max-time 10 "${AUTH[@]}" "$URL/api/agents" 2>/dev/null \ + | jq -r --arg since "$started_at" --arg vm "$vm" ' + [.[] | select(.vm_name==$vm and .status=="healthy" and .last_seen > $since)] + | sort_by(.last_seen) | reverse | .[0].hostname // empty' 2>/dev/null || true) + if [ -n "$host" ] && [ "$host" != "null" ]; then + echo "$vm registered at https://$host" + exit 0 + fi + echo " waiting for $vm to register with $URL... 
(${i}/30)" + sleep 10 + done + echo "::error::$vm never registered with $URL within 5 min" + exit 1 diff --git a/.github/workflows/deploy-cp.yml b/.github/workflows/deploy-cp.yml index a6ab9e7..d21265c 100644 --- a/.github/workflows/deploy-cp.yml +++ b/.github/workflows/deploy-cp.yml @@ -1,18 +1,17 @@ name: Deploy CP # Reusable workflow: provision the CP TDX VM on GCP, wait for it to be -# healthy, verify attestation + dashboard + STONITH, and cascade a -# relaunch of the matching dd-local agent VM. Called from release.yml -# (preview path) and production-deploy.yml (prod path) with different -# inputs — both paths share this exact set of verification steps, so -# preview CI exercises the same code that prod runs. +# healthy, verify attestation + dashboard + STONITH, then cascade a +# relaunch of the matching dd-local agent VM and block until it +# re-registers. Called from release.yml's deploy-preview (PR path) and +# deploy-production (main / dispatch path) with env-specific inputs — +# both paths share this exact set of verification steps so every PR +# exercises the prod deploy code. # # GitHub Actions allows ≤4 levels of workflow_call nesting. Today's -# chain is `release.yml → deploy-cp.yml` (2) and -# `production-deploy.yml → deploy-cp.yml` (2) — deep enough headroom -# that we can still call one more reusable workflow below us if needed. -# The agent-relaunch cascade uses a composite action (same-job, no -# nesting) to keep that headroom. +# chain is `release.yml → deploy-cp.yml` (2). The agent-relaunch +# cascade uses a composite action (same-job, no nesting) to keep +# headroom for future wrapping. on: workflow_call: @@ -95,15 +94,75 @@ jobs: CLOUDFLARE_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }} CLOUDFLARE_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }} # OAuth only in environments that have these set (production). - # When empty, gcp-deploy.sh omits the workload env vars → - # dd-web disables /auth/github/* and serves /auth/pat only. 
+ # Empty placeholder values get stripped below before baking the + # workload spec, so dd-web disables /auth/github/* and serves + # /auth/pat only in those envs. DD_GITHUB_CLIENT_ID: ${{ inputs.oauth_enabled && (vars.DD_GITHUB_CLIENT_ID || secrets.DD_GITHUB_CLIENT_ID) || '' }} DD_GITHUB_CALLBACK_URL: ${{ inputs.oauth_enabled && vars.DD_GITHUB_CALLBACK_URL || '' }} DD_GITHUB_CLIENT_SECRET: ${{ inputs.oauth_enabled && secrets.DD_GITHUB_CLIENT_SECRET || '' }} - # ITA — optional. When set, the CP mints + verifies quotes. DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }} DD_RELEASE_TAG: ${{ inputs.release_tag }} - run: scripts/gcp-deploy.sh + EE_IMAGE_FAMILY: easyenclave-staging + EE_IMAGE_PROJECT: easyenclave + VM_MACHINE_TYPE: c3-standard-4 + VM_DISK_SIZE: 10GB + DD_ITA_BASE_URL: https://api.trustauthority.intel.com + DD_ITA_JWKS_URL: https://portal.trustauthority.intel.com/certs + DD_ITA_ISSUER: https://portal.trustauthority.intel.com + run: | + set -euo pipefail + + VM_NAME="dd-${DD_ENV}-$(date +%s)" + : "${DD_ITA_API_KEY:?set DD_ITA_API_KEY via secrets.DD_ITA_API_KEY}" + export DD_GITHUB_CALLBACK_URL="${DD_GITHUB_CALLBACK_URL:-https://${DD_HOSTNAME}/auth/github/callback}" + + # Bake a workload template: envsubst ${VAR} placeholders and + # strip any "KEY=" env entries that ended up with empty values + # (e.g. OAuth creds in non-prod envs). + bake() { + case "$1" in + *.json.tmpl) + envsubst < "$1" \ + | jq -c 'if .env then .env |= map(select(test("^[^=]+=.+"))) else . end' + ;; + *.json) + jq -c . "$1" + ;; + *) + echo "::error::unknown workload file type: $1" >&2 + return 1 + ;; + esac + } + + # Boot workloads come from apps//workload.{json,json.tmpl}. + # cloudflared fetches the binary onto PATH; dd-management runs + # devopsdefender in DD_MODE=management (CP + dashboard). 
+ EE_BOOT_WORKLOADS=$({ + bake apps/cloudflared/workload.json + bake apps/dd-management/workload.json.tmpl + } | jq -cs '.') + + jq -c -n \ + --arg workloads "$EE_BOOT_WORKLOADS" \ + '{ "EE_BOOT_WORKLOADS": $workloads, "EE_OWNER": "devopsdefender" }' \ + > /tmp/ee-config.json + + gcloud compute instances create "$VM_NAME" \ + --project="$GCP_PROJECT_ID" \ + --zone="$GCP_ZONE" \ + --machine-type="$VM_MACHINE_TYPE" \ + --confidential-compute-type=TDX \ + --maintenance-policy=TERMINATE \ + --boot-disk-size="$VM_DISK_SIZE" \ + --image-family="$EE_IMAGE_FAMILY" \ + --image-project="$EE_IMAGE_PROJECT" \ + --metadata-from-file=ee-config=/tmp/ee-config.json \ + --labels=devopsdefender=managed,dd_env="${DD_ENV}" \ + --tags=dd-management + + rm -f /tmp/ee-config.json + echo "VM: $VM_NAME ($DD_HOSTNAME, release $DD_RELEASE_TAG)" - name: Wait for agent health (streams serial console) env: @@ -287,15 +346,11 @@ jobs: } # Cascade a relaunch of the matching dd-local-{env} libvirt domain - # on the tdx2 host. Preview runs dd-local-preview against the PR's - # CP; prod runs dd-local-prod against app.devopsdefender.com. - # Non-blocking (`continue-on-error`) because the openclaw boot - # chain inside dd-local-preview can take 30 min on first boot — - # we want PR status reflecting the CP deploy, with the agent - # relaunch as a signal-only exercise until vdc is warm. + # on the tdx2 host, then block on it registering with the freshly- + # deployed CP. This is the gate: a release is "done" only when the + # local agent is back online talking to the new CP. 
- name: Relaunch dd-local-${{ inputs.env == 'production' && 'prod' || 'preview' }} if: inputs.relaunch_agent - continue-on-error: true uses: ./.github/actions/relaunch-agent with: kind: ${{ inputs.env == 'production' && 'prod' || 'preview' }} diff --git a/.github/workflows/local-agents.yml b/.github/workflows/local-agents.yml deleted file mode 100644 index 9a32b46..0000000 --- a/.github/workflows/local-agents.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: Local Agents - -# Manual entry point for relaunching one of the local dd-local-{kind} -# libvirt domains on the tdx2 host. The everyday path (prod redeploy, -# preview PR push) now goes through deploy-cp.yml, which calls the -# relaunch-agent composite action directly after a successful CP -# deploy — so this workflow only exists for operator-driven one-shots: -# iterating on scripts/dd-relaunch.sh, re-running a relaunch without -# re-deploying the CP, etc. - -on: - workflow_dispatch: - inputs: - kind: - description: 'prod | preview' - required: true - default: 'prod' - cp_url: - description: 'CP URL (e.g. 
https://app.devopsdefender.com)' - required: true - default: 'https://app.devopsdefender.com' - ref: - description: 'git ref whose scripts/apps tree to check out on the host' - required: true - default: 'main' - -permissions: - contents: read - -concurrency: - group: local-agents-${{ github.event.inputs.kind }} - cancel-in-progress: false - -jobs: - relaunch: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/relaunch-agent - with: - kind: ${{ github.event.inputs.kind }} - url: ${{ github.event.inputs.cp_url }} - ref: ${{ github.event.inputs.ref }} - ssh-key: ${{ secrets.DD_LOCAL_SSH_KEY }} - host: ${{ secrets.DD_LOCAL_HOST }} - dd-pat: ${{ secrets.GITHUB_TOKEN }} - ita-api-key: ${{ secrets.DD_ITA_API_KEY }} diff --git a/.github/workflows/production-deploy.yml b/.github/workflows/production-deploy.yml deleted file mode 100644 index f804e02..0000000 --- a/.github/workflows/production-deploy.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: Production Deploy - -# Two triggers: -# - workflow_run: fires automatically after a successful Release run -# on main. Release publishes the `latest` tag, then this workflow -# deploys it to production. Sequential by design — if Release fails, -# we don't promote. -# - workflow_dispatch: manual re-deploy of any existing tag (e.g. a -# known-good v0.2.0 after a bad main push). -# -# Body lives in deploy-cp.yml — same workflow PR previews use, so every -# PR exercises the prod deploy path before it lands here. - -on: - workflow_run: - workflows: ["Release"] - types: [completed] - branches: [main] - workflow_dispatch: - inputs: - release_tag: - description: 'Release tag to deploy (e.g. latest, v0.2.0)' - required: false - default: 'latest' - -permissions: - contents: read - -jobs: - deploy: - # workflow_run fires on every Release completion, including failures. - # Only promote on success. 
- if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' - permissions: - contents: read - id-token: write - # Granted (though unused — inputs.comment_on_pr=false here) so the - # intersection with deploy-cp.yml's job-level permissions matches. - pull-requests: write - uses: ./.github/workflows/deploy-cp.yml - with: - env: production - hostname: app.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - gcp_environment: production - workload_identity_provider: 'projects/779946350556/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - service_account: 'easyenclave-production-ci@easyenclave.iam.gserviceaccount.com' - # workflow_run has no `inputs`; fall back to `latest`, which - # release.yml just (re)published on push to main. - release_tag: ${{ inputs.release_tag || 'latest' }} - oauth_enabled: true - comment_on_pr: false - ref: main - secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index efe4539..7d55bbc 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,13 +1,18 @@ name: Release -# Build the static musl binary, publish it as a GitHub release asset, -# and (on PRs) deploy it to an ephemeral per-PR preview. Replaces the -# Docker build+push pipeline — easyenclave fetches the asset directly -# via its github_release workload source. +# One workflow to rule them all: build the static musl binary, publish +# it as a GitHub release asset, and deploy it to either the PR preview +# (per-PR ephemeral CP at pr-N.domain) or production (app.domain). Both +# paths cascade into a relaunch of the matching dd-local agent VM on +# the tdx2 host, and the Release run only goes green when that agent +# re-registers with the freshly-deployed CP. # -# PR: pre-release tagged pr-{sha12}, then full PR-preview deploy. 
-# push to main: rolling `latest` release (no deploy — that's production) -# push v* tag: versioned release (no deploy) +# Paths: +# pull_request → build → deploy-preview → dd-local-preview relaunch +# push main → build → deploy-production → dd-local-prod relaunch +# push v* → build only (versioned release, no deploy) +# workflow_dispatch → build → deploy-production (rollback tool; +# release_tag input picks which tag to deploy) on: push: @@ -18,10 +23,18 @@ on: pull_request: paths-ignore: - "README.md" + workflow_dispatch: + inputs: + release_tag: + description: 'Release tag to deploy to production (rollback tool; default: latest)' + required: false + default: 'latest' concurrency: group: dd-release-${{ github.ref }} - cancel-in-progress: true + # PR pushes cancel old runs. Main / tag / manual dispatch runs wait behind the in-flight run instead (GitHub keeps only the newest pending run per group) — + # we never want to cancel an in-progress prod deploy. + cancel-in-progress: ${{ github.event_name == 'pull_request' }} permissions: contents: write @@ -75,10 +88,7 @@ jobs: # `https://github.com/devopsdefender/dd/.github/workflows/release.yml@`). # The attestation is stored on the repo's /attestations endpoint # and retrievable via `gh attestation verify` or the REST API. - # - # For now we're tracking (not enforcing) — the CP will eventually - # use this to verify that a registering agent's artifact came - # from this workflow. Skipped on fork PRs (they lack id-token). + # Skipped on fork PRs (they lack id-token). - name: Attest devopsdefender binary if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository uses: actions/attest-build-provenance@v2 @@ -113,13 +123,8 @@ jobs: | tail -n +12 \ | xargs -rI{} gh release delete {} --yes --cleanup-tag - # Deploy the freshly-built binary to the PR's ephemeral preview. - # Each PR gets its own env at pr-{N}.{domain} with DD_ENV=pr-{N} - # (hostname-isolated, no OAuth — browser access via /auth/pat). 
- # main/v* produce releases that production-deploy picks up separately. - # - # Body lives in deploy-cp.yml — same workflow prod uses, so every PR - # exercises the prod deploy path. + # Per-PR ephemeral preview at pr-{N}.{domain}. No OAuth (browser login + # via /auth/pat). Cascades into dd-local-preview relaunch. deploy-preview: if: github.event_name == 'pull_request' needs: build @@ -139,3 +144,31 @@ jobs: comment_on_pr: true ref: ${{ github.event.pull_request.head.ref }} secrets: inherit + + # Production deploy at app.{domain}. Fires on push-to-main OR on a + # manual workflow_dispatch (rollback to a specific release_tag). + # Tag pushes (v*) intentionally do not auto-deploy — they just + # publish the artifact. Cascades into dd-local-prod relaunch. + deploy-production: + if: >- + (github.event_name == 'push' && github.ref == 'refs/heads/main') + || github.event_name == 'workflow_dispatch' + needs: build + permissions: + contents: read + id-token: write + # Granted (though unused — comment_on_pr=false here) so the + # permissions intersection with deploy-cp.yml's job matches. + pull-requests: write + uses: ./.github/workflows/deploy-cp.yml + with: + env: production + hostname: app.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} + gcp_environment: production + workload_identity_provider: 'projects/779946350556/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' + service_account: 'easyenclave-production-ci@easyenclave.iam.gserviceaccount.com' + release_tag: ${{ inputs.release_tag || 'latest' }} + oauth_enabled: true + comment_on_pr: false + ref: main + secrets: inherit diff --git a/README.md b/README.md index 289d19c..f8a1df1 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ The `devopsdefender` binary ships as a **GitHub release asset** — not an OCI i `cloudflared` is also pulled directly from `cloudflare/cloudflared`'s GitHub releases as a fetch-only boot workload — no bundling in our image, no Dockerfile step. 
-Per-VM configuration (CF credentials, GitHub OAuth, the workload spec itself) is passed to easyenclave at boot via **GCE instance metadata** (`ee-config` attribute), read by `easyenclave::init::fetch_gce_metadata_config()` and applied as env vars. `scripts/gcp-deploy.sh` builds the spec and invokes `gcloud compute instances create --image-family=easyenclave-staging --metadata-from-file=ee-config=...`. +Per-VM configuration (CF credentials, GitHub OAuth, the workload spec itself) is passed to easyenclave at boot via **GCE instance metadata** (`ee-config` attribute), read by `easyenclave::init::fetch_gce_metadata_config()` and applied as env vars. The CP-deploy step in `.github/workflows/deploy-cp.yml` builds the spec and invokes `gcloud compute instances create --image-family=easyenclave-staging --metadata-from-file=ee-config=...`. ## CI/CD @@ -48,16 +48,18 @@ PR → pre-release tagged pr-{sha12}, then ephemeral preview at pr- branch deleted → pr-teardown.yml deletes the preview's VM, CF tunnel, and DNS push to main → rolling `latest` release, then auto-deploy to production push v* tag → versioned release (no auto-deploy) -manual → production-deploy.yml promotes any existing tag +manual dispatch → redeploy any existing tag to production (rollback tool) ``` -Each PR gets its own isolated env at `pr-{N}.{domain}` with `DD_ENV=pr-{N}` — no more shared staging tier. `.github/workflows/release.yml` builds the static musl binary, publishes it as a GitHub release asset, deploys the PR's preview VM, and posts the URL back to the PR. The preview VM is verified via: +Every path lives in `.github/workflows/release.yml`: one `build` job, then either `deploy-preview` (PR) or `deploy-production` (main / dispatch), both calling the reusable `deploy-cp.yml` with env-specific inputs. Each cascades into a relaunch of the matching `dd-local-{kind}` VM (`dd-local-preview` for PRs, `dd-local-prod` for production) on the tdx2 host — the Release run only goes green when that agent re-registers with the freshly-deployed CP. 
Verifications along the way: 1. `/health` via the Cloudflare tunnel 2. `/cp/attest` returning a real TDX MRTD (cryptographic proof the freshly-deployed VM is running — old VMs don't have the endpoint and return 404) -3. No other `dd-pr-{N}-*` VM is RUNNING after deploy (STONITH must have halted the previous instance of this PR) +3. Dashboard `/` returning HTTP 200 under a Bearer PAT +4. No other `dd-{env}-*` VM is RUNNING after deploy (STONITH must have halted the previous instance) +5. `dd-local-{kind}` re-registers with the new CP within 5 min -Browser access to a PR preview goes through `/auth/pat` (paste a GitHub PAT, validated against `DD_OWNER`). OAuth is only wired for production, which `production-deploy.yml` still targets at `app.{domain}`. +Browser access to a PR preview goes through `/auth/pat` (paste a GitHub PAT, validated against `DD_OWNER`). OAuth is only wired for production, at `app.{domain}`. ## STONITH diff --git a/apps/_infra/dd-relaunch.sh b/apps/_infra/dd-relaunch.sh new file mode 100755 index 0000000..55d380d --- /dev/null +++ b/apps/_infra/dd-relaunch.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +# dd-relaunch.sh — destroy and recreate one local TDX agent VM. +# +# Invoked over SSH by .github/actions/relaunch-agent during a Release +# cascade. Pulls the PR's (or main's) apps/ tree so local-agents.sh +# is the one the caller authored (updates to this script itself only take effect on the next invocation). Tears +# down the existing VM + overlay, runs local-agents.sh to redefine, +# and starts the VM. +# +# dd-relaunch.sh prod https://app.devopsdefender.com main +# dd-relaunch.sh preview https://pr-N.devopsdefender.com feat/some-pr +# +# DD_PAT and DD_ITA_API_KEY must be set in the environment. 
+ +set -euo pipefail + +KIND="${1?usage: dd-relaunch.sh <prod|preview> <cp-url> [ref]}" +CP="${2?cp url required}" +REF="${3:-main}" +: "${DD_PAT?DD_PAT must be set}" +: "${DD_ITA_API_KEY?DD_ITA_API_KEY must be set}" + +case "$KIND" in + prod|preview) ;; + *) echo "unknown kind: $KIND (want prod|preview)" >&2; exit 2 ;; +esac + +cd /home/tdx2/src/dd + +# Refresh the infra scripts + apps/ tree from the caller's ref. Limited +# checkout so a dirty working tree elsewhere doesn't block the deploy. +# This script is already in memory, so the refresh takes effect on the +# *next* invocation. +git fetch --quiet origin "$REF" +git checkout --quiet "origin/$REF" -- apps/ +echo "dd-relaunch: refreshed apps/ from origin/$REF" + +vm="dd-local-$KIND" +overlay="/var/lib/libvirt/images/$vm.qcow2" + +virsh destroy "$vm" 2>/dev/null || true +virsh undefine "$vm" --managed-save --snapshots-metadata 2>/dev/null || true +rm -f "$overlay" + +# Redefine via local-agents.sh; "" skips the other slot. +case "$KIND" in + prod) ./apps/_infra/local-agents.sh "" "$CP" ;; + preview) ./apps/_infra/local-agents.sh "$CP" "" ;; +esac + +virsh start "$vm" +echo "relaunched $vm against $CP" diff --git a/scripts/local-agents.sh b/apps/_infra/local-agents.sh similarity index 97% rename from scripts/local-agents.sh rename to apps/_infra/local-agents.sh index 341cc9e..20b772a 100755 --- a/scripts/local-agents.sh +++ b/apps/_infra/local-agents.sh @@ -12,11 +12,11 @@ # Usage: # export DD_PAT="$(gh auth token)" # export DD_ITA_API_KEY="$(cat ~/.secrets/ita_api_key)" -# ./scripts/local-agents.sh https://pr-106.devopsdefender.com https://app.devopsdefender.com +# ./apps/_infra/local-agents.sh https://pr-106.devopsdefender.com https://app.devopsdefender.com # # Pass "" for either URL to skip defining that VM: -# ./scripts/local-agents.sh "" https://app.devopsdefender.com # prod only -# ./scripts/local-agents.sh https://pr-N.devopsdefender.com "" # preview only +# ./apps/_infra/local-agents.sh "" https://app.devopsdefender.com # prod 
only +# ./apps/_infra/local-agents.sh https://pr-N.devopsdefender.com "" # preview only # # After: virsh start dd-local-preview && virsh start dd-local-prod diff --git a/scripts/dd-relaunch.sh b/scripts/dd-relaunch.sh deleted file mode 100755 index a118618..0000000 --- a/scripts/dd-relaunch.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash -# dd-relaunch.sh — destroy and recreate one local TDX agent VM. -# -# Invoked over SSH by .github/workflows/local-agents.yml after a -# Release / Production Deploy succeeds. Pulls the current main of dd -# (so this script and local-agents.sh are always the latest), tears -# down the existing VM + overlay, runs scripts/local-agents.sh to -# redefine, and starts the VM. -# -# dd-relaunch.sh prod https://app.devopsdefender.com -# dd-relaunch.sh preview https://pr-N.devopsdefender.com -# -# DD_PAT and DD_ITA_API_KEY must be set in the environment. - -set -euo pipefail - -KIND="${1?usage: dd-relaunch.sh }" -CP="${2?cp url required}" -REF="${3:-main}" -: "${DD_PAT?DD_PAT must be set}" -: "${DD_ITA_API_KEY?DD_ITA_API_KEY must be set}" - -case "$KIND" in - prod|preview) ;; - *) echo "unknown kind: $KIND (want prod|preview)" >&2; exit 2 ;; -esac - -cd /home/tdx2/src/dd - -# Pull the latest scripts. Limit the checkout to the two scripts so a -# dirty working tree elsewhere doesn't block the deploy. The relaunch -# script itself has already been read into memory by bash, so the -# update takes effect on the *next* invocation. 
-git fetch --quiet origin "$REF" -git checkout --quiet "origin/$REF" -- scripts/local-agents.sh scripts/dd-relaunch.sh -git checkout --quiet "origin/$REF" -- scripts/workloads.sh 2>/dev/null || true -git checkout --quiet "origin/$REF" -- apps/ 2>/dev/null || true -echo "dd-relaunch: refreshed scripts + apps/ from origin/$REF" - -vm="dd-local-$KIND" -overlay="/var/lib/libvirt/images/$vm.qcow2" - -virsh destroy "$vm" 2>/dev/null || true -virsh undefine "$vm" --managed-save --snapshots-metadata 2>/dev/null || true -rm -f "$overlay" - -# Redefine via local-agents.sh; "" skips the other slot. -case "$KIND" in - prod) ./scripts/local-agents.sh "" "$CP" ;; - preview) ./scripts/local-agents.sh "$CP" "" ;; -esac - -virsh start "$vm" -echo "relaunched $vm against $CP" - -# ollama deploy + pull + query is driven from the workflow's HTTPS step -# on ubuntu-latest, not here — see .github/workflows/local-agents.yml. diff --git a/scripts/gcp-deploy.sh b/scripts/gcp-deploy.sh deleted file mode 100755 index a65a378..0000000 --- a/scripts/gcp-deploy.sh +++ /dev/null @@ -1,148 +0,0 @@ -#!/bin/bash -# gcp-deploy.sh — Create a TDX management VM on GCP that boots from a -# sealed easyenclave image and runs dd management as a native process. -# -# Both the devopsdefender binary and cloudflared are fetched straight -# from their GitHub releases by easyenclave's github_release workload -# source — no OCI registry, no Dockerfile. Cloudflared is a fetch-only -# boot workload: its binary lands in /var/lib/easyenclave/bin (now on -# PATH) so dd-register can shell out to `cloudflared` by name. -# -# Agent-side mirror: a local TDX guest with a vfio-pci-passed GPU can -# register against the CP this script deploys by using the same -# easyenclave `github_release` workload source for the devopsdefender -# binary, with `DD_REGISTER_URL=wss://{hostname}/register`. See the -# local-GPU demo notes in the commit trail. -# -# Called by .github/workflows/{staging,production}-deploy.yml. 
Requires -# gcloud CLI authenticated via Workload Identity Federation. -# -# Required env vars (set by the workflow): -# GCP_PROJECT_ID — GCP project where the VM lives -# GCP_ZONE — GCP zone (e.g. us-central1-c) -# DD_ENV — staging, production, or pr-{num} (ephemeral per-PR) -# DD_DOMAIN — Public domain (e.g. devopsdefender.com) -# CLOUDFLARE_API_TOKEN — CF API token (dd-register uses it) -# CLOUDFLARE_ACCOUNT_ID — CF account ID -# CLOUDFLARE_ZONE_ID — CF zone ID -# -# Optional env vars: -# DD_HOSTNAME — public hostname override. If unset, derived -# from DD_ENV (production → app.$DOMAIN, -# anything else → app-staging.$DOMAIN). Set -# explicitly for per-PR envs (pr-42.$DOMAIN). -# DD_GITHUB_CLIENT_ID — GitHub OAuth client ID. If unset, dd-web -# disables OAuth login and only PAT auth works. -# Per-PR envs leave this unset. -# DD_GITHUB_CLIENT_SECRET — GitHub OAuth client secret (paired with above) -# DD_GITHUB_CALLBACK_URL — OAuth callback, default https://{hostname}/auth/github/callback -# EE_IMAGE_FAMILY — easyenclave GCP image family -# EE_IMAGE_PROJECT — project hosting the image -# DD_RELEASE_TAG — GitHub release tag on devopsdefender/dd -# (defaults to 'latest'; PRs override with pr-{sha12}) -# VM_MACHINE_TYPE — default c3-standard-4 -# VM_DISK_SIZE — default 10GB - -set -euo pipefail - -# ── easyenclave image family ────────────────────────────────────────────── -# easyenclave-staging → rolling main, rotates on every push (5 kept) -# easyenclave-stable → v* tags, kept forever -EE_IMAGE_FAMILY="${EE_IMAGE_FAMILY:-easyenclave-staging}" -EE_IMAGE_PROJECT="${EE_IMAGE_PROJECT:-easyenclave}" -DD_RELEASE_TAG="${DD_RELEASE_TAG:-latest}" - -VM_NAME="dd-${DD_ENV}-$(date +%s)" -VM_MACHINE_TYPE="${VM_MACHINE_TYPE:-c3-standard-4}" -VM_DISK_SIZE="${VM_DISK_SIZE:-10GB}" - -if [ -z "${DD_HOSTNAME:-}" ]; then - if [ "${DD_ENV}" = "production" ]; then - DD_HOSTNAME="app.${DD_DOMAIN}" - else - DD_HOSTNAME="app-staging.${DD_DOMAIN}" - fi -fi 
-DD_GITHUB_CLIENT_ID="${DD_GITHUB_CLIENT_ID:-}" -DD_GITHUB_CLIENT_SECRET="${DD_GITHUB_CLIENT_SECRET:-}" -DD_GITHUB_CALLBACK_URL="${DD_GITHUB_CALLBACK_URL:-https://${DD_HOSTNAME}/auth/github/callback}" - -# Intel Trust Authority — mandatory. DD_ITA_API_KEY must be set in the -# workflow (from secrets.DD_ITA_API_KEY). The CP will refuse to start -# without one. Everything else has a default. -if [ -z "${DD_ITA_API_KEY:-}" ]; then - echo "DD_ITA_API_KEY is required (configure secrets.DD_ITA_API_KEY)" >&2 - exit 1 -fi -DD_ITA_BASE_URL="${DD_ITA_BASE_URL:-https://api.trustauthority.intel.com}" -DD_ITA_JWKS_URL="${DD_ITA_JWKS_URL:-https://portal.trustauthority.intel.com/certs}" -DD_ITA_ISSUER="${DD_ITA_ISSUER:-https://portal.trustauthority.intel.com}" - -# ── Build the workload spec ────────────────────────────────────────────── -# Boot workloads come from apps//workload.{json,json.tmpl}. Same -# file per workload whether this CP runs in prod, staging, or a PR -# preview; only the env-var substitutions differ. -# -# cloudflared — fetch-only, puts the binary on PATH for DD to spawn. -# dd-management — devopsdefender in DD_MODE=management (CP + dashboard). -# -# Empty ${DD_GITHUB_CLIENT_ID} etc produce empty "KEY=" strings; the -# bake helper strips those so the resulting spec matches the old -# `if $gh_client_id == "" then [] else [...]` conditional. -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" -# shellcheck source=./workloads.sh -source "$SCRIPT_DIR/workloads.sh" -EE_BOOT_WORKLOADS=$( - DD_RELEASE_TAG="$DD_RELEASE_TAG" \ - CLOUDFLARE_API_TOKEN="$CLOUDFLARE_API_TOKEN" \ - CLOUDFLARE_ACCOUNT_ID="$CLOUDFLARE_ACCOUNT_ID" \ - CLOUDFLARE_ZONE_ID="$CLOUDFLARE_ZONE_ID" \ - DD_DOMAIN="$DD_DOMAIN" \ - DD_HOSTNAME="$DD_HOSTNAME" \ - DD_ENV="$DD_ENV" \ - DD_GITHUB_CLIENT_ID="$DD_GITHUB_CLIENT_ID" \ - DD_GITHUB_CLIENT_SECRET="$DD_GITHUB_CLIENT_SECRET" \ - DD_GITHUB_CALLBACK_URL="$DD_GITHUB_CALLBACK_URL" \ - DD_ITA_API_KEY="$DD_ITA_API_KEY" \ - DD_ITA_BASE_URL="$DD_ITA_BASE_URL" \ - DD_ITA_JWKS_URL="$DD_ITA_JWKS_URL" \ - DD_ITA_ISSUER="$DD_ITA_ISSUER" \ - join \ - "$REPO_ROOT/apps/cloudflared/workload.json" \ - "$REPO_ROOT/apps/dd-management/workload.json.tmpl" -) -# ollama + openclaw are NOT baked into the CP preview. EE's tmpfs -# /var/lib/easyenclave is too small for the 900 MB container image, -# and attaching a scratch PD here would duplicate what the local -# dd-local-preview VM already provides via its vdc ext4 disk. The -# preview CP stays slim; the ollama+openclaw demo registers from -# dd-local-preview (scripts/local-agents.sh). 
- -# ── Wrap into ee-config ─────────────────────────────────────────────────── -jq -c -n \ - --arg workloads "$EE_BOOT_WORKLOADS" \ - '{ "EE_BOOT_WORKLOADS": $workloads, "EE_OWNER": "devopsdefender" }' \ - > /tmp/ee-config.json - -trap 'rm -f /tmp/ee-config.json' EXIT - -# ── Create the VM ───────────────────────────────────────────────────────── -gcloud compute instances create "$VM_NAME" \ - --project="$GCP_PROJECT_ID" \ - --zone="$GCP_ZONE" \ - --machine-type="$VM_MACHINE_TYPE" \ - --confidential-compute-type=TDX \ - --maintenance-policy=TERMINATE \ - --boot-disk-size="$VM_DISK_SIZE" \ - --image-family="$EE_IMAGE_FAMILY" \ - --image-project="$EE_IMAGE_PROJECT" \ - --metadata-from-file=ee-config=/tmp/ee-config.json \ - --labels=devopsdefender=managed,dd_env="${DD_ENV}" \ - --tags=dd-management - -echo "VM: $VM_NAME" -echo " image: family $EE_IMAGE_FAMILY ($EE_IMAGE_PROJECT)" -echo " hostname: $DD_HOSTNAME" -echo " dd release: $DD_RELEASE_TAG" -echo " workload: dd management" diff --git a/scripts/redeploy-workload.sh b/scripts/redeploy-workload.sh deleted file mode 100755 index e866d3f..0000000 --- a/scripts/redeploy-workload.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -# redeploy-workload.sh — POST one baked workload spec to a live agent's -# /deploy endpoint. Handy for iterating on apps//workload.json -# without rebuilding the whole config.iso + restarting the VM. -# -# Usage: -# redeploy-workload.sh -# -# Example: -# DD_PAT=$(gh auth token) \ -# ./scripts/redeploy-workload.sh \ -# https://app.devopsdefender.com \ -# dd-local-prod \ -# apps/openclaw/workload.json.tmpl -# -# Requires DD_PAT in env. Template envs (MODEL, DD_CP_URL, …) must -# also be exported if the referenced workload file is a .tmpl. - -set -euo pipefail - -CP_URL="${1?usage: redeploy-workload.sh }" -VM_NAME="${2?vm_name required (e.g. dd-local-prod)}" -APP_PATH="${3?app_path required (e.g. apps/openclaw/workload.json.tmpl)}" -: "${DD_PAT?set DD_PAT (e.g. 
DD_PAT=\$(gh auth token))}" - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=./workloads.sh -source "$SCRIPT_DIR/workloads.sh" - -AUTH=(-H "Authorization: Bearer $DD_PAT") - -# Discover the agent's tunnel hostname via CP's fleet API. -agent_host=$( - curl -fsS "${AUTH[@]}" "$CP_URL/api/agents" 2>/dev/null \ - | jq -r --arg vm "$VM_NAME" ' - [.[] | select(.vm_name==$vm and .status=="healthy")] - | sort_by(.last_seen) | reverse | .[0].hostname // empty' -) -if [ -z "$agent_host" ] || [ "$agent_host" = "null" ]; then - echo "ERROR: no healthy $VM_NAME in $CP_URL/api/agents" >&2 - exit 1 -fi -echo "agent: https://$agent_host" - -spec=$(bake "$APP_PATH") -echo "redeploying $(echo "$spec" | jq -r .app_name)..." -curl -fsS --max-time 60 "${AUTH[@]}" \ - "https://$agent_host/deploy" \ - -H 'Content-Type: application/json' \ - -d "$spec" | jq -c . diff --git a/scripts/workloads.sh b/scripts/workloads.sh deleted file mode 100755 index c41fe47..0000000 --- a/scripts/workloads.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash -# workloads.sh — shared helpers for assembling EE workload specs. -# -# A DD "workload" is a JSON object with {app_name, github_release, -# cmd, env} that EE's DeployRequest consumes. Each app lives in -# apps//workload.json (literal) or apps//workload.json.tmpl -# (with ${VAR} placeholders substituted at bake time from the caller's -# environment). -# -# Public functions: -# bake — print one rendered workload to stdout. -# Plain .json is emitted as-is; .json.tmpl -# gets envsubst + empty-env-entry stripping. -# join [path…] — print a JSON array of rendered workloads. -# -# Sourced from scripts/local-agents.sh and scripts/gcp-deploy.sh so -# both scripts share one source of truth for the workload shape. - -# Render a single workload file. -# For .json files, passthrough. 
-# For .json.tmpl files, substitute ${VAR} from the current env, then -# remove any "KEY=" env array entries that ended up with an empty -# value (matches the conditional-include pattern gcp-deploy.sh used -# for DD_GITHUB_CLIENT_ID & co). -bake() { - local path="$1" - if [[ "$path" == *.json ]]; then - jq -c . "$path" - elif [[ "$path" == *.json.tmpl ]]; then - envsubst < "$path" \ - | jq -c 'if .env then .env |= map(select(. | test("^[^=]+=.+"))) else . end' - else - echo "workloads.sh: unknown workload file type: $path" >&2 - return 1 - fi -} - -# Print a JSON array of rendered workloads. -join() { - local out="[" - local first=1 - for p in "$@"; do - local rendered - rendered=$(bake "$p") || return 1 - if [ $first -eq 1 ]; then - out+="$rendered" - first=0 - else - out+=",$rendered" - fi - done - out+="]" - echo "$out" -}