diff --git a/.github/actions/relaunch-agent/action.yml b/.github/actions/relaunch-agent/action.yml
new file mode 100644
index 0000000..a58a048
--- /dev/null
+++ b/.github/actions/relaunch-agent/action.yml
@@ -0,0 +1,78 @@
+name: Relaunch local TDX agent
+description: >-
+  SSH into the tdx2 host and recreate the matching dd-local-{kind} libvirt
+  domain against the given CP url, pulling scripts from the given git ref.
+  Shared between Local Agents (manual dispatch) and Deploy CP (cascading
+  relaunch after a successful CP deploy).
+
+inputs:
+  kind:
+    description: 'prod | preview — which libvirt domain to relaunch'
+    required: true
+  url:
+    description: 'CP URL the agent should register against (e.g. https://app.devopsdefender.com)'
+    required: true
+  ref:
+    description: 'git ref whose scripts/apps tree dd-relaunch.sh should check out on the host'
+    required: true
+  ssh-key:
+    description: 'Private SSH key for tdx2@host'
+    required: true
+  host:
+    description: 'Public host address of the tdx2 node'
+    required: true
+  dd-pat:
+    description: 'GitHub PAT the agent uses to talk to the CP'
+    required: true
+  ita-api-key:
+    description: 'Intel Trust Authority API key for attestation'
+    required: true
+
+runs:
+  using: composite
+  steps:
+    # CP must be reachable before we SSH — on PR pushes we race with
+    # Release's deploy-preview standing up the pr-N CP. /health is public.
+    - name: Wait for CP to be healthy
+      shell: bash
+      env:
+        URL: ${{ inputs.url }}
+      run: |
+        for i in $(seq 1 60); do
+          if curl -fsS --max-time 5 "$URL/health" >/dev/null 2>&1; then
+            echo "CP $URL healthy after ${i} attempts"
+            exit 0
+          fi
+          echo " waiting for $URL... (${i}/60)"
+          sleep 10
+        done
+        echo "::error::CP $URL never came up within 10 min"
+        exit 1
+
+    # SSH in and relaunch the VM (destroy + redefine + start). Finishes
+    # in ~10 s — the baked config.iso's EE_BOOT_WORKLOADS drives the rest.
+    #
+    # The remote command line is assembled with printf %q so that every
+    # value reaches the host's shell as one literal word. `ref` can be a
+    # PR head branch name, and branch names may contain quotes, `;`, `$`
+    # or backticks — wrapping it in bare '...' would let such a name run
+    # arbitrary commands on the host.
+    - name: ssh + relaunch VM
+      shell: bash
+      env:
+        SSH_KEY: ${{ inputs.ssh-key }}
+        HOST: ${{ inputs.host }}
+        DD_PAT: ${{ inputs.dd-pat }}
+        DD_ITA_API_KEY: ${{ inputs.ita-api-key }}
+        KIND: ${{ inputs.kind }}
+        URL: ${{ inputs.url }}
+        REF: ${{ inputs.ref }}
+      run: |
+        mkdir -p ~/.ssh
+        printf '%s\n' "$SSH_KEY" > ~/.ssh/id_ed25519
+        chmod 600 ~/.ssh/id_ed25519
+        ssh-keyscan -H "$HOST" >> ~/.ssh/known_hosts 2>/dev/null
+        remote_cmd=$(printf 'DD_PAT=%q DD_ITA_API_KEY=%q /home/tdx2/src/dd/scripts/dd-relaunch.sh %q %q %q' \
+          "$DD_PAT" "$DD_ITA_API_KEY" "$KIND" "$URL" "$REF")
+        ssh -o BatchMode=yes -o StrictHostKeyChecking=yes \
+          -i ~/.ssh/id_ed25519 "tdx2@$HOST" "$remote_cmd"
diff --git a/.github/workflows/deploy-cp.yml b/.github/workflows/deploy-cp.yml
new file mode 100644
index 0000000..a6ab9e7
--- /dev/null
+++ b/.github/workflows/deploy-cp.yml
@@ -0,0 +1,307 @@
+name: Deploy CP
+
+# Reusable workflow: provision the CP TDX VM on GCP, wait for it to be
+# healthy, verify attestation + dashboard + STONITH, and cascade a
+# relaunch of the matching dd-local agent VM. Called from release.yml
+# (preview path) and production-deploy.yml (prod path) with different
+# inputs — both paths share this exact set of verification steps, so
+# preview CI exercises the same code that prod runs.
+#
+# GitHub Actions allows ≤4 levels of workflow_call nesting. Today's
+# chain is `release.yml → deploy-cp.yml` (2) and
+# `production-deploy.yml → deploy-cp.yml` (2) — enough headroom
+# that we can still call one more reusable workflow below us if needed.
+# The agent-relaunch cascade uses a composite action (same-job, no
+# nesting) to keep that headroom.
+
+on:
+  workflow_call:
+    inputs:
+      env:
+        description: 'DD_ENV (e.g. "production", "pr-42")'
+        required: true
+        type: string
+      hostname:
+        description: 'Public hostname (e.g. 
app.devopsdefender.com)'
+        required: true
+        type: string
+      gcp_environment:
+        description: 'GitHub environment name — "production" | "staging"'
+        required: true
+        type: string
+      workload_identity_provider:
+        description: 'GCP Workload Identity Federation provider resource name'
+        required: true
+        type: string
+      service_account:
+        description: 'GCP service account email'
+        required: true
+        type: string
+      release_tag:
+        description: 'devopsdefender release tag to deploy (e.g. "latest", "pr-abc123")'
+        required: true
+        type: string
+      oauth_enabled:
+        description: 'Enable GitHub OAuth (prod only; previews use PAT)'
+        required: false
+        type: boolean
+        default: false
+      comment_on_pr:
+        description: 'Leave a PR comment with the preview URL'
+        required: false
+        type: boolean
+        default: false
+      relaunch_agent:
+        description: 'After CP deploy, cascade a relaunch of dd-local-{env} via SSH'
+        required: false
+        type: boolean
+        default: true
+      ref:
+        description: 'Git ref the tdx2 host should pull before relaunching the agent VM'
+        required: false
+        type: string
+        default: main
+
+concurrency:
+  group: deploy-cp-${{ inputs.env }}
+  cancel-in-progress: false
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    environment: ${{ inputs.gcp_environment }}
+    permissions:
+      contents: read
+      id-token: write
+      pull-requests: write
+    env:
+      DD_ENV: ${{ inputs.env }}
+      DD_HOSTNAME: ${{ inputs.hostname }}
+      GCP_ZONE: us-central1-c
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: google-github-actions/auth@v2
+        with:
+          workload_identity_provider: ${{ inputs.workload_identity_provider }}
+          service_account: ${{ inputs.service_account }}
+      - uses: google-github-actions/setup-gcloud@v2
+
+      - name: Create TDX VM (boots from easyenclave, fetches dd from GitHub releases)
+        env:
+          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
+          DD_DOMAIN: ${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }}
+          CLOUDFLARE_API_TOKEN: ${{ secrets.DD_CP_CF_API_TOKEN }}
+          CLOUDFLARE_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }}
+          CLOUDFLARE_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }}
+          # OAuth only in environments that have these set (production).
+          # When empty, gcp-deploy.sh omits the workload env vars →
+          # dd-web disables /auth/github/* and serves /auth/pat only.
+          DD_GITHUB_CLIENT_ID: ${{ inputs.oauth_enabled && (vars.DD_GITHUB_CLIENT_ID || secrets.DD_GITHUB_CLIENT_ID) || '' }}
+          DD_GITHUB_CALLBACK_URL: ${{ inputs.oauth_enabled && vars.DD_GITHUB_CALLBACK_URL || '' }}
+          DD_GITHUB_CLIENT_SECRET: ${{ inputs.oauth_enabled && secrets.DD_GITHUB_CLIENT_SECRET || '' }}
+          # ITA — optional. When set, the CP mints + verifies quotes.
+          DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }}
+          DD_RELEASE_TAG: ${{ inputs.release_tag }}
+        run: scripts/gcp-deploy.sh
+
+      - name: Wait for agent health (streams serial console)
+        env:
+          AGENT_URL: https://${{ inputs.hostname }}
+          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
+        run: |
+          VM_NAME=$(gcloud compute instances list \
+            --project="$GCP_PROJECT_ID" \
+            --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \
+            --format="value(name)" --sort-by=~creationTimestamp | head -1)
+          if [ -z "$VM_NAME" ]; then
+            echo "::error::no dd-${DD_ENV} VM found — gcp-deploy.sh must have failed"
+            exit 1
+          fi
+          echo "Watching VM: $VM_NAME (zone: $GCP_ZONE)"
+
+          LAST_LINES=0
+          for i in $(seq 1 60); do
+            # Stream serial console so boot failures (DHCP hang, release
+            # fetch error, cloudflared exit, etc.) are visible without
+            # shelling into GCP.
+            gcloud compute instances get-serial-port-output "$VM_NAME" \
+              --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" 2>/dev/null \
+              > /tmp/serial.log || true
+            TOTAL_LINES=$(wc -l < /tmp/serial.log)
+            if [ "$TOTAL_LINES" -gt "$LAST_LINES" ]; then
+              tail -n +$((LAST_LINES + 1)) /tmp/serial.log \
+                | sed 's/^/[serial] /'
+              LAST_LINES=$TOTAL_LINES
+            fi
+
+            if grep -qE "FATAL|Kernel panic|Invalid ELF header|/bin/sh: can't access tty" /tmp/serial.log; then
+              echo "::error::boot failed — serial log shows fatal pattern"
+              exit 1
+            fi
+
+            if curl -fsS --max-time 10 "${AGENT_URL}/health" >/dev/null 2>&1; then
+              echo "Agent healthy at ${AGENT_URL}"
+              exit 0
+            fi
+            echo " waiting for tunnel... (${i}/60)"
+            sleep 5
+          done
+          echo "::error::Agent not healthy within 5 minutes"
+          echo "--- final serial tail ---"
+          tail -80 /tmp/serial.log | sed 's/^/[serial] /'
+          exit 1
+
+      - name: Verify NEW VM via TDX attestation
+        env:
+          AGENT_URL: https://${{ inputs.hostname }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # /cp/attest proves the freshly-deployed VM is serving the tunnel
+          # (stale tunnels point at old VMs that 404 on this endpoint).
+          # MRTD = 48 bytes at offset 184 in TDX quote v4; if non-zero,
+          # attestation actually worked.
+          NONCE=$(openssl rand -base64 16)
+          for attempt in $(seq 1 60); do
+            BODY=$(curl -sG --max-time 30 -w '\n%{http_code}' \
+              -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+              --data-urlencode "nonce=${NONCE}" \
+              "${AGENT_URL}/cp/attest" || echo $'\n000')
+            CODE=$(echo "$BODY" | tail -n1)
+            JSON=$(echo "$BODY" | sed '$d')
+            if [ "$CODE" = "200" ]; then
+              QUOTE_B64=$(echo "$JSON" | jq -r '.quote_b64 // empty')
+              if [ -n "$QUOTE_B64" ] && [ "$QUOTE_B64" != "null" ]; then
+                MRTD=$(echo "$QUOTE_B64" | base64 -d \
+                  | dd bs=1 skip=184 count=48 status=none | xxd -p -c 48)
+                if [ -n "$MRTD" ] && [ "$MRTD" != "$(printf '00%.0s' {1..48})" ]; then
+                  echo "NEW VM verified — MRTD: $MRTD"
+                  exit 0
+                fi
+                echo " /cp/attest 200 but MRTD empty/zero, retrying... (${attempt}/60)"
+              else
+                echo " /cp/attest 200 but no quote_b64, retrying... (${attempt}/60)"
+              fi
+            else
+              echo " /cp/attest returned HTTP ${CODE}, retrying... (${attempt}/60)"
+            fi
+            sleep 10
+          done
+          echo "::error::/cp/attest never returned a valid quote — stale tunnel or new VM never came up"
+          exit 1
+
+      - name: Verify dashboard renders
+        env:
+          AGENT_URL: https://${{ inputs.hostname }}
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          # Fast sanity check on top of /cp/attest — proves dd-web is up
+          # and accepts the CI PAT's Bearer auth.
+          for attempt in $(seq 1 12); do
+            code=$(curl -s --max-time 10 -o /dev/null -w '%{http_code}' \
+              -H "Authorization: Bearer ${GITHUB_TOKEN}" \
+              "${AGENT_URL}/" || true)  # -w already prints 000 when the transfer fails
+            if [ "$code" = "200" ]; then
+              echo "Dashboard renders (HTTP 200, attempt ${attempt})"
+              exit 0
+            fi
+            echo " dashboard returned HTTP ${code}, retrying... (${attempt}/12)"
+            sleep 5
+          done
+          echo "::error::dashboard / never returned 200 (last HTTP ${code})"
+          exit 1
+
+      - name: Verify STONITH halted prior VM(s) in this env
+        env:
+          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
+        run: |
+          # dd-register STONITHs the old VM on startup by deleting its CF tunnel →
+          # old cloudflared exits → old dd-register poweroffs. Scoped to this env
+          # (previews are hostname-isolated), so only its prior deploys are reaped.
+          # Empty NEW_VM aborts: else nothing is excluded and the new VM is reaped.
+          NEW_VM=$(gcloud compute instances list \
+            --project="$GCP_PROJECT_ID" \
+            --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \
+            --format="value(name)" --sort-by=~creationTimestamp | head -1)
+          echo "new VM: ${NEW_VM:?no dd-${DD_ENV} VM found — refusing to reap}"
+          SURVIVORS=""
+          for i in $(seq 1 24); do
+            SURVIVORS=$(gcloud compute instances list \
+              --project="$GCP_PROJECT_ID" \
+              --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \
+              --format="value(name)" \
+              | grep -vxF -- "$NEW_VM" || true)
+            if [ -z "$SURVIVORS" ]; then
+              echo "STONITH verified — only $NEW_VM running in ${DD_ENV}"
+              exit 0
+            fi
+            echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')"
+            echo " waiting for STONITH poweroff... (${i}/24)"
+            sleep 5
+          done
+          echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:"
+          echo "$SURVIVORS"
+          # shellcheck disable=SC2086
+          gcloud compute instances delete $SURVIVORS \
+            --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true
+          echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM"
+
+      - name: Comment preview URL on PR
+        if: inputs.comment_on_pr && github.event_name == 'pull_request'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const url = `https://${process.env.DD_HOSTNAME}`; // job-level env; keeps the input out of the script source
+            const body = [
+              `### DD preview ready`,
+              ``,
+              `**URL:** ${url}`,
+              ``,
+              `Browser login: paste \`gh auth token\` output at ${url}/auth/pat`,
+              ``,
+              `CLI / curl: \`curl -H "Authorization: Bearer $(gh auth token)" ${url}/\``,
+              ``,
+              `Register endpoint for a local agent: \`wss://${process.env.DD_HOSTNAME}/register\``,
+            ].join('\n');
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+            const marker = '### DD preview ready';
+            const existing = comments.find(c => c.user.type === 'Bot' && c.body && c.body.includes(marker));
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: 
context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + }); + } + + # Cascade a relaunch of the matching dd-local-{env} libvirt domain + # on the tdx2 host. Preview runs dd-local-preview against the PR's + # CP; prod runs dd-local-prod against app.devopsdefender.com. + # Non-blocking (`continue-on-error`) because the openclaw boot + # chain inside dd-local-preview can take 30 min on first boot — + # we want PR status reflecting the CP deploy, with the agent + # relaunch as a signal-only exercise until vdc is warm. + - name: Relaunch dd-local-${{ inputs.env == 'production' && 'prod' || 'preview' }} + if: inputs.relaunch_agent + continue-on-error: true + uses: ./.github/actions/relaunch-agent + with: + kind: ${{ inputs.env == 'production' && 'prod' || 'preview' }} + url: https://${{ inputs.hostname }} + ref: ${{ inputs.ref }} + ssh-key: ${{ secrets.DD_LOCAL_SSH_KEY }} + host: ${{ secrets.DD_LOCAL_HOST }} + dd-pat: ${{ secrets.GITHUB_TOKEN }} + ita-api-key: ${{ secrets.DD_ITA_API_KEY }} diff --git a/.github/workflows/local-agents.yml b/.github/workflows/local-agents.yml index 345dbc3..9a32b46 100644 --- a/.github/workflows/local-agents.yml +++ b/.github/workflows/local-agents.yml @@ -1,24 +1,14 @@ name: Local Agents -# Relaunches the local TDX agent VM on this user's host whenever the -# corresponding CP gets new code: -# - Production Deploy success → reboot dd-local-prod against app.devopsdefender.com -# - Release success on a PR → reboot dd-local-preview against pr-N.devopsdefender.com -# -# SSHs in via key auth to a public-IP host, then invokes -# scripts/dd-relaunch.sh which handles the destroy/recreate cycle. +# Manual entry point for relaunching one of the local dd-local-{kind} +# libvirt domains on the tdx2 host. 
The everyday path (prod redeploy, +# preview PR push) now goes through deploy-cp.yml, which calls the +# relaunch-agent composite action directly after a successful CP +# deploy — so this workflow only exists for operator-driven one-shots: +# iterating on scripts/dd-relaunch.sh, re-running a relaunch without +# re-deploying the CP, etc. on: - workflow_run: - workflows: ["Release", "Production Deploy"] - types: [completed] - # Every non-README push to main also fires a prod relaunch directly, - # so fixes to the relaunch / deploy scripts get exercised even when - # they don't cascade through Release → Production Deploy. - push: - branches: [main] - paths-ignore: - - "README.md" workflow_dispatch: inputs: kind: @@ -29,83 +19,29 @@ on: description: 'CP URL (e.g. https://app.devopsdefender.com)' required: true default: 'https://app.devopsdefender.com' + ref: + description: 'git ref whose scripts/apps tree to check out on the host' + required: true + default: 'main' permissions: contents: read - pull-requests: read concurrency: - group: local-agents-${{ github.event.workflow_run.name || github.event.inputs.kind }} + group: local-agents-${{ github.event.inputs.kind }} cancel-in-progress: false jobs: relaunch: - if: | - github.event_name == 'workflow_dispatch' - || github.event_name == 'push' - || github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - id: pick - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - EVENT: ${{ github.event_name }} - WF: ${{ github.event.workflow_run.name }} - BRANCH: ${{ github.event.workflow_run.head_branch }} - DISPATCH_KIND: ${{ github.event.inputs.kind }} - DISPATCH_URL: ${{ github.event.inputs.cp_url }} - run: | - if [ "$EVENT" = "workflow_dispatch" ]; then - echo "kind=$DISPATCH_KIND" >> "$GITHUB_OUTPUT" - echo "url=$DISPATCH_URL" >> "$GITHUB_OUTPUT" - elif [ "$EVENT" = "push" ] || [ "$WF" = "Production Deploy" ]; then - # push-to-main on local-agent scripts, or a prod CP redeploy - # 
→ relaunch dd-local-prod against the live prod CP. - echo "kind=prod" >> "$GITHUB_OUTPUT" - echo "url=https://app.devopsdefender.com" >> "$GITHUB_OUTPUT" - else - # Release on a PR: derive pr-N. Released-on-main returns - # no open PR → skip (Production Deploy will fire shortly). - pr=$(gh pr list --head "$BRANCH" --state open \ - --repo "${{ github.repository }}" \ - --json number --jq '.[0].number' 2>/dev/null || true) - if [ -n "$pr" ]; then - echo "kind=preview" >> "$GITHUB_OUTPUT" - echo "url=https://pr-$pr.devopsdefender.com" >> "$GITHUB_OUTPUT" - else - echo "kind=skip" >> "$GITHUB_OUTPUT" - fi - fi - - # Step 1: SSH in and relaunch the VM (destroy + redefine + start). - # Finishes in ~10 s — doesn't need keepalives. Only does the - # libvirt operations that require host-level access. - - name: ssh + relaunch VM - if: steps.pick.outputs.kind != 'skip' - env: - SSH_KEY: ${{ secrets.DD_LOCAL_SSH_KEY }} - HOST: ${{ secrets.DD_LOCAL_HOST }} - DD_PAT: ${{ secrets.GITHUB_TOKEN }} - DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }} - KIND: ${{ steps.pick.outputs.kind }} - URL: ${{ steps.pick.outputs.url }} - run: | - mkdir -p ~/.ssh - printf '%s\n' "$SSH_KEY" > ~/.ssh/id_ed25519 - chmod 600 ~/.ssh/id_ed25519 - ssh-keyscan -H "$HOST" >> ~/.ssh/known_hosts 2>/dev/null - ssh -o BatchMode=yes -o StrictHostKeyChecking=yes \ - -i ~/.ssh/id_ed25519 "tdx2@$HOST" \ - "DD_PAT='$DD_PAT' DD_ITA_API_KEY='$DD_ITA_API_KEY' /home/tdx2/src/dd/scripts/dd-relaunch.sh '$KIND' '$URL'" - - # Step 2: Deploy ollama / pull model / sample query. Pure HTTPS - # against the CP + the newly-registered agent's tunnel. Can take - # minutes (model pull) — no SSH to keep alive. 
- - name: deploy ollama (HTTPS) - if: steps.pick.outputs.kind != 'skip' - env: - DD_PAT: ${{ secrets.GITHUB_TOKEN }} - KIND: ${{ steps.pick.outputs.kind }} - URL: ${{ steps.pick.outputs.url }} - run: ./scripts/ollama-deploy.sh "$KIND" "$URL" + - uses: ./.github/actions/relaunch-agent + with: + kind: ${{ github.event.inputs.kind }} + url: ${{ github.event.inputs.cp_url }} + ref: ${{ github.event.inputs.ref }} + ssh-key: ${{ secrets.DD_LOCAL_SSH_KEY }} + host: ${{ secrets.DD_LOCAL_HOST }} + dd-pat: ${{ secrets.GITHUB_TOKEN }} + ita-api-key: ${{ secrets.DD_ITA_API_KEY }} diff --git a/.github/workflows/production-deploy.yml b/.github/workflows/production-deploy.yml index 8253179..f804e02 100644 --- a/.github/workflows/production-deploy.yml +++ b/.github/workflows/production-deploy.yml @@ -7,6 +7,9 @@ name: Production Deploy # we don't promote. # - workflow_dispatch: manual re-deploy of any existing tag (e.g. a # known-good v0.2.0 after a bad main push). +# +# Body lives in deploy-cp.yml — same workflow PR previews use, so every +# PR exercises the prod deploy path before it lands here. on: workflow_run: @@ -20,127 +23,31 @@ on: required: false default: 'latest' -concurrency: - group: dd-production - cancel-in-progress: false - -env: - GCP_ZONE: us-central1-c - DD_ENV: production - DD_DOMAIN: ${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - permissions: contents: read jobs: - # dd-register STONITHs the old VM on startup by deleting its CF - # tunnel, so no explicit teardown here. deploy: - # workflow_run fires on every Release completion, including - # failures. Only promote on success. + # workflow_run fires on every Release completion, including failures. + # Only promote on success. 
if: github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'success' - runs-on: ubuntu-latest - environment: production permissions: contents: read id-token: write - steps: - - uses: actions/checkout@v4 - - uses: google-github-actions/auth@v2 - with: - workload_identity_provider: 'projects/779946350556/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - service_account: 'easyenclave-production-ci@easyenclave.iam.gserviceaccount.com' - - uses: google-github-actions/setup-gcloud@v2 - - - name: Create TDX VM (boots from easyenclave, fetches dd from GitHub releases) - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - CLOUDFLARE_API_TOKEN: ${{ secrets.DD_CP_CF_API_TOKEN }} - CLOUDFLARE_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }} - CLOUDFLARE_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }} - DD_GITHUB_CLIENT_ID: ${{ vars.DD_GITHUB_CLIENT_ID || secrets.DD_GITHUB_CLIENT_ID }} - DD_GITHUB_CALLBACK_URL: ${{ vars.DD_GITHUB_CALLBACK_URL }} - DD_GITHUB_CLIENT_SECRET: ${{ secrets.DD_GITHUB_CLIENT_SECRET }} - # Intel Trust Authority — optional. When the secret is set, - # the CP mints its own ITA token and verifies incoming agent - # registrations. DD_ITA_REQUIRED stays false (default). - DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }} - # workflow_run has no `inputs`; fall back to `latest`, which - # release.yml just (re)published on push to main. - DD_RELEASE_TAG: ${{ inputs.release_tag || 'latest' }} - run: scripts/gcp-deploy.sh - - - name: Wait for agent health - env: - AGENT_URL: https://app.${{ env.DD_DOMAIN }} - run: | - for i in $(seq 1 60); do - curl -fsS "${AGENT_URL}/health" >/dev/null 2>&1 && { - echo "Agent healthy at ${AGENT_URL}" - exit 0 - } - echo " waiting for tunnel... 
(${i}/60)" - sleep 5 - done - echo "::error::Agent not healthy within 5 minutes" - exit 1 - - - name: Verify dashboard renders - env: - AGENT_URL: https://app.${{ env.DD_DOMAIN }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # New auth model: dashboard expects a GitHub PAT/GITHUB_TOKEN with - # access to the dd repo; the CP verifies against DD_OWNER via the - # standard /user + /repos/{owner}/dd fallback. No OIDC audience wiring. - for attempt in $(seq 1 12); do - code=$(curl -s -o /dev/null -w '%{http_code}' \ - -H "Authorization: Bearer ${GITHUB_TOKEN}" \ - "${AGENT_URL}/" || echo 000) - if [ "$code" = "200" ]; then - echo "Dashboard renders (HTTP 200, attempt ${attempt})" - exit 0 - fi - echo " dashboard returned HTTP ${code}, retrying... (${attempt}/12)" - sleep 5 - done - echo "::error::dashboard / never returned 200 (last HTTP ${code})" - exit 1 - - - name: Verify STONITH halted prior production VM(s) - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - GCP_ZONE: ${{ env.GCP_ZONE }} - run: | - # Mirror of release.yml's verify-step for PR previews. Give - # STONITH-by-tunnel-delete 120s to work on well-behaved old - # prod VMs (their cloudflared exits → dd-register poweroffs - # → GCP TERMINATED → cleanup.yml reaps). After the timeout, - # force-delete any remaining RUNNING prod VMs so we don't - # leak compute indefinitely. 
- NEW_VM=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=production" \ - --format="value(name)" --sort-by=~creationTimestamp | head -1) - echo "new VM: $NEW_VM" - SURVIVORS="" - for i in $(seq 1 24); do - SURVIVORS=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=production AND status=RUNNING" \ - --format="value(name)" \ - | grep -vx "$NEW_VM" || true) - if [ -z "$SURVIVORS" ]; then - echo "STONITH verified — only $NEW_VM running in prod" - exit 0 - fi - echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')" - echo " waiting for STONITH poweroff... (${i}/24)" - sleep 5 - done - echo "::warning::STONITH-by-tunnel-delete timed out in prod; force-deleting:" - echo "$SURVIVORS" - # shellcheck disable=SC2086 - gcloud compute instances delete $SURVIVORS \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true - echo "zombies reaped; $NEW_VM is the only production VM" + # Granted (though unused — inputs.comment_on_pr=false here) so the + # intersection with deploy-cp.yml's job-level permissions matches. + pull-requests: write + uses: ./.github/workflows/deploy-cp.yml + with: + env: production + hostname: app.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} + gcp_environment: production + workload_identity_provider: 'projects/779946350556/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' + service_account: 'easyenclave-production-ci@easyenclave.iam.gserviceaccount.com' + # workflow_run has no `inputs`; fall back to `latest`, which + # release.yml just (re)published on push to main. 
+ release_tag: ${{ inputs.release_tag || 'latest' }} + oauth_enabled: true + comment_on_pr: false + ref: main + secrets: inherit diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b238c85..efe4539 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,10 +31,6 @@ permissions: id-token: write attestations: write -env: - DD_DOMAIN: ${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - GCP_ZONE: us-central1-c - jobs: build: runs-on: ubuntu-latest @@ -121,223 +117,25 @@ jobs: # Each PR gets its own env at pr-{N}.{domain} with DD_ENV=pr-{N} # (hostname-isolated, no OAuth — browser access via /auth/pat). # main/v* produce releases that production-deploy picks up separately. + # + # Body lives in deploy-cp.yml — same workflow prod uses, so every PR + # exercises the prod deploy path. deploy-preview: if: github.event_name == 'pull_request' needs: build - runs-on: ubuntu-latest - environment: staging permissions: contents: read id-token: write pull-requests: write - env: - DD_ENV: pr-${{ github.event.number }} - DD_HOSTNAME: pr-${{ github.event.number }}.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - steps: - - uses: actions/checkout@v4 - - - uses: google-github-actions/auth@v2 - with: - workload_identity_provider: 'projects/654815109728/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - service_account: 'easyenclave-staging-ci@eestaging.iam.gserviceaccount.com' - - uses: google-github-actions/setup-gcloud@v2 - - - name: Create TDX VM (boots from easyenclave, fetches dd from GitHub releases) - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - CLOUDFLARE_API_TOKEN: ${{ secrets.DD_CP_CF_API_TOKEN }} - CLOUDFLARE_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }} - CLOUDFLARE_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }} - # OAuth env vars intentionally omitted — gcp-deploy.sh sees - # empty DD_GITHUB_CLIENT_ID and skips them in the workload - # spec. 
dd-web then disables /auth/github/* and serves - # /auth/pat for browser access. - # - # Intel Trust Authority — optional. When the secret is set, - # the CP mints its own ITA token at startup and verifies - # agent-supplied tokens on /register. DD_ITA_REQUIRED stays - # false (default) so unsigned agents still register. - DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }} - DD_RELEASE_TAG: ${{ needs.build.outputs.tag }} - run: scripts/gcp-deploy.sh - - - name: Wait for agent health (streams serial console) - env: - AGENT_URL: https://${{ env.DD_HOSTNAME }} - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - GCP_ZONE: ${{ env.GCP_ZONE }} - run: | - VM_NAME=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ - --format="value(name)" --sort-by=~creationTimestamp | head -1) - if [ -z "$VM_NAME" ]; then - echo "::error::no dd-${DD_ENV} VM found — gcp-deploy.sh must have failed" - exit 1 - fi - echo "Watching VM: $VM_NAME (zone: $GCP_ZONE)" - - LAST_LINES=0 - for i in $(seq 1 60); do - # Stream serial console so boot failures (DHCP hang, GitHub - # release fetch error, cloudflared exit, etc.) are visible - # without shelling into GCP. - gcloud compute instances get-serial-port-output "$VM_NAME" \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" 2>/dev/null \ - > /tmp/serial.log || true - TOTAL_LINES=$(wc -l < /tmp/serial.log) - if [ "$TOTAL_LINES" -gt "$LAST_LINES" ]; then - tail -n +$((LAST_LINES + 1)) /tmp/serial.log \ - | sed 's/^/[serial] /' - LAST_LINES=$TOTAL_LINES - fi - - if grep -qE "FATAL|Kernel panic|Invalid ELF header|/bin/sh: can't access tty" /tmp/serial.log; then - echo "::error::boot failed — serial log shows fatal pattern" - exit 1 - fi - - # /health via the Cloudflare tunnel tests the full chain: - # VM boot → easyenclave init → github_release fetch of dd + - # cloudflared → cloudflared tunnel up. 
- if curl -fsS "${AGENT_URL}/health" >/dev/null 2>&1; then - echo "Agent healthy at ${AGENT_URL}" - exit 0 - fi - echo " waiting for tunnel... (${i}/60)" - sleep 5 - done - echo "::error::Agent not healthy within 5 minutes" - echo "--- final serial tail ---" - tail -80 /tmp/serial.log | sed 's/^/[serial] /' - exit 1 - - - name: Verify NEW VM via TDX attestation - env: - AGENT_URL: https://${{ env.DD_HOSTNAME }} - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # /cp/attest proves the freshly-deployed VM is serving the tunnel - # (stale tunnels point at old VMs that 404 on this endpoint). - # Auth: GITHUB_TOKEN via Bearer — the CP's /repos/{owner}/dd probe - # accepts any token with repo access. No OIDC audience wiring. - NONCE=$(openssl rand -base64 16) - - # 60 × 10s = 10 min. New VM has to boot, fetch cloudflared - # and dd from GitHub releases, start, and bring its tunnel up. - for attempt in $(seq 1 60); do - BODY=$(curl -sG -w '\n%{http_code}' \ - -H "Authorization: Bearer ${GITHUB_TOKEN}" \ - --data-urlencode "nonce=${NONCE}" \ - "${AGENT_URL}/cp/attest" || echo $'\n000') - CODE=$(echo "$BODY" | tail -n1) - JSON=$(echo "$BODY" | sed '$d') - if [ "$CODE" = "200" ]; then - QUOTE_B64=$(echo "$JSON" | jq -r '.quote_b64 // empty') - if [ -n "$QUOTE_B64" ] && [ "$QUOTE_B64" != "null" ]; then - # MRTD = 48 bytes at offset 184 in TDX quote v4. - # If it's non-zero, attestation actually worked. - MRTD=$(echo "$QUOTE_B64" | base64 -d \ - | dd bs=1 skip=184 count=48 status=none | xxd -p -c 48) - if [ -n "$MRTD" ] && [ "$MRTD" != "$(printf '00%.0s' {1..48})" ]; then - echo "NEW VM verified — MRTD: $MRTD" - exit 0 - fi - echo " /cp/attest 200 but MRTD empty/zero, retrying... (${attempt}/60)" - else - echo " /cp/attest 200 but no quote_b64, retrying... (${attempt}/60)" - fi - else - echo " /cp/attest returned HTTP ${CODE}, retrying... 
(${attempt}/60)" - fi - sleep 10 - done - echo "::error::/cp/attest never returned a valid quote — stale tunnel or new VM never came up" - exit 1 - - - name: Verify STONITH halted prior VM(s) in this env - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - GCP_ZONE: ${{ env.GCP_ZONE }} - run: | - # STONITH (dd-register deletes the old tunnel → old cloudflared - # exits → old dd-register poweroffs the VM) is the ONLY cleanup - # mechanism. Scoped to this PR's env — previews are - # hostname-isolated from each other, so this only reaps prior - # deploys of the same PR (re-pushes). - NEW_VM=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ - --format="value(name)" --sort-by=~creationTimestamp | head -1) - echo "new VM: $NEW_VM" - - # Give STONITH-by-tunnel-delete 120s to work on well-behaved - # old VMs (their cloudflared exits → dd-register poweroffs). - # After that, force-delete any remaining survivors: they're - # zombies whose dd-register failed before creating a tunnel - # (e.g. CF auth error at boot — see src/cp.rs - # which now kernel_poweroff's on init failure, so this is a - # safety net for pre-fix zombies and any future init failure - # modes we haven't handled). - SURVIVORS="" - for i in $(seq 1 24); do - SURVIVORS=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \ - --format="value(name)" \ - | grep -vx "$NEW_VM" || true) - if [ -z "$SURVIVORS" ]; then - echo "STONITH verified — only $NEW_VM running in dd_env=${DD_ENV}" - exit 0 - fi - echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')" - echo " waiting for STONITH poweroff... 
(${i}/24)" - sleep 5 - done - echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:" - echo "$SURVIVORS" - # shellcheck disable=SC2086 - gcloud compute instances delete $SURVIVORS \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true - echo "zombies reaped; $NEW_VM is the only $DD_ENV VM" - - - name: Comment preview URL on PR - uses: actions/github-script@v7 - with: - script: | - const url = `https://${process.env.DD_HOSTNAME}`; - const body = [ - `### DD preview ready`, - ``, - `**URL:** ${url}`, - ``, - `Browser login: paste \`gh auth token\` output at ${url}/auth/pat`, - ``, - `CLI / curl: \`curl -H "Authorization: Bearer $(gh auth token)" ${url}/\``, - ``, - `Register endpoint for a local agent: \`wss://${process.env.DD_HOSTNAME}/register\``, - ].join('\n'); - - // Update existing bot comment if present, else create. - const { data: comments } = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - }); - const marker = '### DD preview ready'; - const existing = comments.find(c => c.user.type === 'Bot' && c.body && c.body.includes(marker)); - if (existing) { - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: existing.id, - body, - }); - } else { - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body, - }); - } + uses: ./.github/workflows/deploy-cp.yml + with: + env: pr-${{ github.event.number }} + hostname: pr-${{ github.event.number }}.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} + gcp_environment: staging + workload_identity_provider: 'projects/654815109728/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' + service_account: 'easyenclave-staging-ci@eestaging.iam.gserviceaccount.com' + release_tag: ${{ needs.build.outputs.tag }} + oauth_enabled: false + 
comment_on_pr: true + ref: ${{ github.event.pull_request.head.ref }} + secrets: inherit diff --git a/apps/cloudflared/workload.json b/apps/cloudflared/workload.json new file mode 100644 index 0000000..1b2270a --- /dev/null +++ b/apps/cloudflared/workload.json @@ -0,0 +1,8 @@ +{ + "app_name": "cloudflared", + "github_release": { + "repo": "cloudflare/cloudflared", + "asset": "cloudflared-linux-amd64", + "rename": "cloudflared" + } +} diff --git a/apps/dd-agent/workload.json.tmpl b/apps/dd-agent/workload.json.tmpl new file mode 100644 index 0000000..a0e6d04 --- /dev/null +++ b/apps/dd-agent/workload.json.tmpl @@ -0,0 +1,22 @@ +{ + "app_name": "dd-agent", + "github_release": { + "repo": "devopsdefender/dd", + "asset": "devopsdefender", + "tag": "latest" + }, + "cmd": ["devopsdefender", "agent"], + "env": [ + "DD_MODE=agent", + "DD_CP_URL=${DD_CP_URL}", + "DD_PAT=${DD_PAT}", + "DD_ITA_API_KEY=${DD_ITA_API_KEY}", + "DD_ITA_BASE_URL=https://api.trustauthority.intel.com", + "DD_ITA_JWKS_URL=https://portal.trustauthority.intel.com/certs", + "DD_ITA_ISSUER=https://portal.trustauthority.intel.com", + "DD_OWNER=devopsdefender", + "DD_ENV=${DD_ENV}", + "DD_VM_NAME=${DD_VM_NAME}", + "DD_PORT=8080" + ] +} diff --git a/apps/dd-management/workload.json.tmpl b/apps/dd-management/workload.json.tmpl new file mode 100644 index 0000000..e8fbe17 --- /dev/null +++ b/apps/dd-management/workload.json.tmpl @@ -0,0 +1,29 @@ +{ + "app_name": "dd-management", + "github_release": { + "repo": "devopsdefender/dd", + "asset": "devopsdefender", + "tag": "${DD_RELEASE_TAG}" + }, + "cmd": ["devopsdefender"], + "env": [ + "DD_MODE=management", + "DD_CF_API_TOKEN=${CLOUDFLARE_API_TOKEN}", + "DD_CF_ACCOUNT_ID=${CLOUDFLARE_ACCOUNT_ID}", + "DD_CF_ZONE_ID=${CLOUDFLARE_ZONE_ID}", + "DD_CF_DOMAIN=${DD_DOMAIN}", + "DD_HOSTNAME=${DD_HOSTNAME}", + "DD_ENV=${DD_ENV}", + "DD_OWNER=devopsdefender", + "DD_REGISTER_PORT=8081", + "DD_OIDC_AUDIENCE=dd-web", + "DD_PORT=8080", + 
"DD_GITHUB_CLIENT_ID=${DD_GITHUB_CLIENT_ID}", + "DD_GITHUB_CLIENT_SECRET=${DD_GITHUB_CLIENT_SECRET}", + "DD_GITHUB_CALLBACK_URL=${DD_GITHUB_CALLBACK_URL}", + "DD_ITA_API_KEY=${DD_ITA_API_KEY}", + "DD_ITA_BASE_URL=${DD_ITA_BASE_URL}", + "DD_ITA_JWKS_URL=${DD_ITA_JWKS_URL}", + "DD_ITA_ISSUER=${DD_ITA_ISSUER}" + ] +} diff --git a/apps/mount-models/workload.json b/apps/mount-models/workload.json new file mode 100644 index 0000000..94111d8 --- /dev/null +++ b/apps/mount-models/workload.json @@ -0,0 +1,7 @@ +{ + "app_name": "mount-models", + "cmd": [ + "/bin/busybox", "sh", "-c", + "mkdir -p /var/lib/easyenclave/ollama && mount /dev/vdc /var/lib/easyenclave/ollama && echo mount-models: ok; sleep inf" + ] +} diff --git a/apps/nv/workload.json b/apps/nv/workload.json new file mode 100644 index 0000000..047ed3d --- /dev/null +++ b/apps/nv/workload.json @@ -0,0 +1,7 @@ +{ + "app_name": "nv", + "cmd": [ + "/bin/busybox", "sh", "-c", + "/sbin/insmod /lib/modules/7.0.0-14-generic/kernel/nvidia-580srv-open/nvidia.ko NVreg_OpenRmEnableUnsupportedGpus=1 2>&1 && echo nv: loaded || echo nv: failed; sleep inf" + ] +} diff --git a/apps/ollama/workload.preview.json b/apps/ollama/workload.preview.json new file mode 100644 index 0000000..8455622 --- /dev/null +++ b/apps/ollama/workload.preview.json @@ -0,0 +1,7 @@ +{ + "app_name": "ollama", + "cmd": [ + "/bin/busybox", "sh", "-c", + "until [ -x /var/lib/easyenclave/bin/dd-podman ]; do sleep 2; done\nexec /var/lib/easyenclave/bin/dd-podman run --rm --name ollama --network=host -v /var/lib/easyenclave/ollama:/root/.ollama -e OLLAMA_HOST=127.0.0.1:11434 docker.io/ollama/ollama:latest serve" + ] +} diff --git a/apps/ollama/workload.prod.json b/apps/ollama/workload.prod.json new file mode 100644 index 0000000..eae4a9a --- /dev/null +++ b/apps/ollama/workload.prod.json @@ -0,0 +1,7 @@ +{ + "app_name": "ollama", + "cmd": [ + "/bin/busybox", "sh", "-c", + "until [ -x /var/lib/easyenclave/bin/dd-podman ]; do sleep 2; done\nexec 
/var/lib/easyenclave/bin/dd-podman run --rm --name ollama --network=host --device=/dev/nvidia0 --device=/dev/nvidiactl --device=/dev/nvidia-uvm -v /var/lib/easyenclave/ollama:/root/.ollama -e OLLAMA_HOST=127.0.0.1:11434 docker.io/ollama/ollama:latest serve" + ] +} diff --git a/apps/openclaw/workload.json.tmpl b/apps/openclaw/workload.json.tmpl new file mode 100644 index 0000000..6f9087d --- /dev/null +++ b/apps/openclaw/workload.json.tmpl @@ -0,0 +1,7 @@ +{ + "app_name": "openclaw", + "cmd": [ + "/bin/busybox", "sh", "-c", + "echo 'openclaw: waiting for ollama on 127.0.0.1:11434...'\ni=0\nuntil /bin/busybox wget -q -T 3 -O- http://127.0.0.1:11434/api/tags >/dev/null 2>&1; do\n i=$((i+1))\n if [ $((i % 6)) -eq 0 ]; then echo \"openclaw: still waiting for ollama ($i tries, ${i}x5s elapsed)\"; fi\n sleep 5\ndone\necho 'openclaw: ollama responding, pulling model ${MODEL}'\n/var/lib/easyenclave/bin/dd-podman exec ollama ollama pull ${MODEL} 2>&1\necho 'openclaw: model pulled, launching gateway'\nexec /var/lib/easyenclave/bin/dd-podman exec ollama ollama launch openclaw --model ${MODEL} --yes" + ] +} diff --git a/apps/podman-bootstrap/workload.json b/apps/podman-bootstrap/workload.json new file mode 100644 index 0000000..5a797e4 --- /dev/null +++ b/apps/podman-bootstrap/workload.json @@ -0,0 +1,7 @@ +{ + "app_name": "podman-bootstrap", + "cmd": [ + "/bin/busybox", "sh", "-c", + "set -e\nBIN=/var/lib/easyenclave/bin\nSRC=$BIN/podman-linux-amd64\nuntil [ -x $SRC/usr/local/bin/podman ]; do sleep 1; done\n# If there's a vdc scratch disk, wait for mount-models to actually\n# mount it before we write files under /var/lib/easyenclave/ollama —\n# otherwise our writes land on the rootfs tmpfs and get shadowed the\n# moment vdc is mounted. 
On VMs without vdc (GCP CP preview) there's\n# no mount-models workload and this check short-circuits.\nif [ -b /dev/vdc ]; then\n until mountpoint -q /var/lib/easyenclave/ollama 2>/dev/null; do sleep 1; done\nfi\nmkdir -p /var/lib/easyenclave/ollama\ncp -f $SRC/usr/local/bin/* $BIN/\ncp -f $SRC/usr/local/lib/podman/conmon $BIN/\ncp -f $SRC/usr/local/lib/podman/netavark $BIN/ 2>/dev/null || true\ncp -f $SRC/usr/local/lib/podman/aardvark-dns $BIN/ 2>/dev/null || true\ncp -f $SRC/usr/local/lib/podman/rootlessport $BIN/ 2>/dev/null || true\nmkdir -p /var/lib/easyenclave/ollama/.podman/storage /var/lib/easyenclave/ollama/.podman/runroot\n# /dev/shm is where podman puts its per-container POSIX shm lock\n# file (libpod_lock). EE's guest rootfs may not mount tmpfs on\n# /dev/shm; without it, podman fails 'failed to create 2048 locks\n# in /libpod_lock: no such file or directory'. mkdir + mount idempotently.\nif ! mountpoint -q /dev/shm 2>/dev/null; then\n mkdir -p /dev/shm\n mount -t tmpfs -o size=64M tmpfs /dev/shm 2>/dev/null || true\nfi\n# Pick storage driver based on substrate. vdc-backed ext4 supports\n# native overlay (fast + space-efficient). Without vdc (GCP CP\n# preview, any guest running on tmpfs rootfs), overlay-on-tmpfs\n# errors out, so fall back to vfs (slower, full copy per layer, but\n# works on any filesystem).\nif mountpoint -q /var/lib/easyenclave/ollama; then\n DRIVER=overlay\nelse\n DRIVER=vfs\nfi\n# Write containers.conf on vdc (writable). /etc is RO on EE so we\n# can't put it where podman looks by default. helper_binaries_dir\n# tells podman where we staged conmon/netavark/aardvark-dns/… —\n# podman probes those at startup even with --network=host.\nPOL=/var/lib/easyenclave/ollama/.podman/policy.json\n# Minimum viable signature policy: trust anything. 
EE's attestation\n# story happens one layer up (image digest pinned by the spec we\n# baked); podman's own signature checking would duplicate that.\nprintf '%s' '{\"default\":[{\"type\":\"insecureAcceptAnything\"}]}' > $POL\n# Podman's containers-common looks for policy.json at hardcoded\n# paths (/etc/containers/, $HOME/.config/containers/). /etc and\n# /root are both RO on EE, so build a fake HOME under\n# /var/lib/easyenclave/.home (writable) and set HOME there in the\n# dd-podman wrapper.\nHOME_DIR=/var/lib/easyenclave/.home\nmkdir -p $HOME_DIR/.config/containers\ncp -f $POL $HOME_DIR/.config/containers/policy.json\nCONF=/var/lib/easyenclave/ollama/.podman/containers.conf\nprintf '%s\\n' '[engine]' 'helper_binaries_dir = [\"/var/lib/easyenclave/bin\"]' > $CONF\nmkdir -p $HOME_DIR/tmp\nprintf '%s\\n' '#!/bin/sh' \"export HOME=$HOME_DIR\" \"export TMPDIR=$HOME_DIR/tmp\" \"export CONTAINERS_CONF=$CONF\" \"exec /var/lib/easyenclave/bin/podman --conmon=/var/lib/easyenclave/bin/conmon --runtime=/var/lib/easyenclave/bin/crun --storage-driver=$DRIVER --root=/var/lib/easyenclave/ollama/.podman/storage --runroot=/var/lib/easyenclave/ollama/.podman/runroot --cgroup-manager=cgroupfs \\\"\\$@\\\"\" > $BIN/dd-podman\nchmod +x $BIN/dd-podman\nls -la $CONF $POL $BIN/dd-podman 2>&1 || true\ncat $CONF\necho podman-bootstrap: v2 ok driver=$DRIVER conf=$CONF policy=$POL" + ] +} diff --git a/apps/podman-static/workload.json b/apps/podman-static/workload.json new file mode 100644 index 0000000..939125d --- /dev/null +++ b/apps/podman-static/workload.json @@ -0,0 +1,7 @@ +{ + "app_name": "podman-static", + "github_release": { + "repo": "mgoltzsche/podman-static", + "asset": "podman-linux-amd64.tar.gz" + } +} diff --git a/scripts/dd-relaunch.sh b/scripts/dd-relaunch.sh index bdf1d8d..a118618 100755 --- a/scripts/dd-relaunch.sh +++ b/scripts/dd-relaunch.sh @@ -16,6 +16,7 @@ set -euo pipefail KIND="${1?usage: dd-relaunch.sh }" CP="${2?cp url required}" +REF="${3:-main}" : 
"${DD_PAT?DD_PAT must be set}" : "${DD_ITA_API_KEY?DD_ITA_API_KEY must be set}" @@ -30,8 +31,11 @@ cd /home/tdx2/src/dd # dirty working tree elsewhere doesn't block the deploy. The relaunch # script itself has already been read into memory by bash, so the # update takes effect on the *next* invocation. -git fetch --quiet origin main -git checkout --quiet origin/main -- scripts/local-agents.sh scripts/dd-relaunch.sh +git fetch --quiet origin "$REF" +git checkout --quiet "origin/$REF" -- scripts/local-agents.sh scripts/dd-relaunch.sh +git checkout --quiet "origin/$REF" -- scripts/workloads.sh 2>/dev/null || true +git checkout --quiet "origin/$REF" -- apps/ 2>/dev/null || true +echo "dd-relaunch: refreshed scripts + apps/ from origin/$REF" vm="dd-local-$KIND" overlay="/var/lib/libvirt/images/$vm.qcow2" diff --git a/scripts/gcp-deploy.sh b/scripts/gcp-deploy.sh index 96eed6b..a65a378 100755 --- a/scripts/gcp-deploy.sh +++ b/scripts/gcp-deploy.sh @@ -79,74 +79,45 @@ DD_ITA_JWKS_URL="${DD_ITA_JWKS_URL:-https://portal.trustauthority.intel.com/cert DD_ITA_ISSUER="${DD_ITA_ISSUER:-https://portal.trustauthority.intel.com}" # ── Build the workload spec ────────────────────────────────────────────── -# Two boot workloads: -# 1. cloudflared — fetch-only. easyenclave downloads cloudflare's -# static binary from their GitHub release, symlinks it as -# `cloudflared`, and exits the deploy as "completed". The binary -# sits on PATH for dd-register to spawn. -# 2. dd-management — fetches the devopsdefender binary from our own -# release and runs it. dd-register + dd-web both live in this -# single process (DD_MODE=management). 
-EE_BOOT_WORKLOADS=$(jq -c -n \ - --arg dd_tag "$DD_RELEASE_TAG" \ - --arg cf_token "$CLOUDFLARE_API_TOKEN" \ - --arg cf_account "$CLOUDFLARE_ACCOUNT_ID" \ - --arg cf_zone "$CLOUDFLARE_ZONE_ID" \ - --arg domain "$DD_DOMAIN" \ - --arg hostname "$DD_HOSTNAME" \ - --arg env "$DD_ENV" \ - --arg gh_client_id "$DD_GITHUB_CLIENT_ID" \ - --arg gh_client_secret "$DD_GITHUB_CLIENT_SECRET" \ - --arg gh_callback "$DD_GITHUB_CALLBACK_URL" \ - --arg ita_api_key "$DD_ITA_API_KEY" \ - --arg ita_base_url "$DD_ITA_BASE_URL" \ - --arg ita_jwks_url "$DD_ITA_JWKS_URL" \ - --arg ita_issuer "$DD_ITA_ISSUER" \ - '[ - { - "github_release": { - "repo": "cloudflare/cloudflared", - "asset": "cloudflared-linux-amd64", - "rename": "cloudflared" - }, - "app_name": "cloudflared" - }, - { - "github_release": { - "repo": "devopsdefender/dd", - "asset": "devopsdefender", - "tag": $dd_tag - }, - "cmd": ["devopsdefender"], - "app_name": "dd-management", - "env": ( - [ - "DD_MODE=management", - ("DD_CF_API_TOKEN=" + $cf_token), - ("DD_CF_ACCOUNT_ID=" + $cf_account), - ("DD_CF_ZONE_ID=" + $cf_zone), - ("DD_CF_DOMAIN=" + $domain), - ("DD_HOSTNAME=" + $hostname), - ("DD_ENV=" + $env), - "DD_OWNER=devopsdefender", - "DD_REGISTER_PORT=8081", - "DD_OIDC_AUDIENCE=dd-web", - "DD_PORT=8080" - ] - + (if $gh_client_id == "" then [] else [ - ("DD_GITHUB_CLIENT_ID=" + $gh_client_id), - ("DD_GITHUB_CLIENT_SECRET=" + $gh_client_secret), - ("DD_GITHUB_CALLBACK_URL=" + $gh_callback) - ] end) - + [ - ("DD_ITA_API_KEY=" + $ita_api_key), - ("DD_ITA_BASE_URL=" + $ita_base_url), - ("DD_ITA_JWKS_URL=" + $ita_jwks_url), - ("DD_ITA_ISSUER=" + $ita_issuer) - ] - ) - } - ]') +# Boot workloads come from apps//workload.{json,json.tmpl}. Same +# file per workload whether this CP runs in prod, staging, or a PR +# preview; only the env-var substitutions differ. +# +# cloudflared — fetch-only, puts the binary on PATH for DD to spawn. +# dd-management — devopsdefender in DD_MODE=management (CP + dashboard). 
+# +# Empty ${DD_GITHUB_CLIENT_ID} etc produce empty "KEY=" strings; the +# bake helper strips those so the resulting spec matches the old +# `if $gh_client_id == "" then [] else [...]` conditional. +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +# shellcheck source=./workloads.sh +source "$SCRIPT_DIR/workloads.sh" +EE_BOOT_WORKLOADS=$( + DD_RELEASE_TAG="$DD_RELEASE_TAG" \ + CLOUDFLARE_API_TOKEN="$CLOUDFLARE_API_TOKEN" \ + CLOUDFLARE_ACCOUNT_ID="$CLOUDFLARE_ACCOUNT_ID" \ + CLOUDFLARE_ZONE_ID="$CLOUDFLARE_ZONE_ID" \ + DD_DOMAIN="$DD_DOMAIN" \ + DD_HOSTNAME="$DD_HOSTNAME" \ + DD_ENV="$DD_ENV" \ + DD_GITHUB_CLIENT_ID="$DD_GITHUB_CLIENT_ID" \ + DD_GITHUB_CLIENT_SECRET="$DD_GITHUB_CLIENT_SECRET" \ + DD_GITHUB_CALLBACK_URL="$DD_GITHUB_CALLBACK_URL" \ + DD_ITA_API_KEY="$DD_ITA_API_KEY" \ + DD_ITA_BASE_URL="$DD_ITA_BASE_URL" \ + DD_ITA_JWKS_URL="$DD_ITA_JWKS_URL" \ + DD_ITA_ISSUER="$DD_ITA_ISSUER" \ + join \ + "$REPO_ROOT/apps/cloudflared/workload.json" \ + "$REPO_ROOT/apps/dd-management/workload.json.tmpl" +) +# ollama + openclaw are NOT baked into the CP preview. EE's tmpfs +# /var/lib/easyenclave is too small for the 900 MB container image, +# and attaching a scratch PD here would duplicate what the local +# dd-local-preview VM already provides via its vdc ext4 disk. The +# preview CP stays slim; the ollama+openclaw demo registers from +# dd-local-preview (scripts/local-agents.sh). 
# ── Wrap into ee-config ─────────────────────────────────────────────────── jq -c -n \ diff --git a/scripts/local-agents.sh b/scripts/local-agents.sh index 17c61fa..341cc9e 100755 --- a/scripts/local-agents.sh +++ b/scripts/local-agents.sh @@ -255,3 +255,8 @@ echo echo "watch registration (Ctrl-] to exit):" [ -n "$PREVIEW_CP" ] && echo " virsh console dd-local-preview" [ -n "$PROD_CP" ] && echo " virsh console dd-local-prod" + +# Explicit 0 — the tail `[ -n "$PROD_CP" ] && …` returns 1 when +# PROD_CP="" (preview-only), bubbling up as the script exit status +# and tripping set -e in dd-relaunch.sh. Force success. +exit 0 diff --git a/scripts/ollama-deploy.sh b/scripts/ollama-deploy.sh deleted file mode 100755 index 4d4babf..0000000 --- a/scripts/ollama-deploy.sh +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env bash -# ollama-deploy.sh — run ollama + OpenClaw inside a DD agent VM as -# podman containers. No ollama binary on the guest rootfs (that's -# dynamically linked and fails on EE's busybox rootfs with -# `libstdc++.so.6: cannot open shared object file`). Instead: -# -# 1. Fetch static podman (mgoltzsche/podman-static tarball) as a -# fetch-only DD workload. -# 2. One-shot bootstrap via /exec — flatten the tarball's nested -# bin dir into /var/lib/easyenclave/bin and write a minimal -# /etc/containers/containers.conf (cgroup_manager=cgroupfs so -# we don't need systemd). -# 3. Deploy the ollama container as a long-running workload -# (podman run --net=host ...). Prod also passes the three -# nvidia device nodes for H100 access. -# 4. Pull the right-sized model via `podman exec ollama ollama pull`. -# 5. Launch OpenClaw (a bridge from messaging apps to coding -# agents; subcommand of ollama, npm-installed on first run) as -# a second long-running workload using the same container. 
-# -# ollama-deploy.sh -# kind: prod | preview -# cp_url: https://app.devopsdefender.com | https://pr-N.devopsdefender.com -# -# Requires DD_PAT in the environment (the workflow's GITHUB_TOKEN). - -set -euo pipefail - -KIND="${1?usage: ollama-deploy.sh }" -CP_URL="${2?cp_url required}" -: "${DD_PAT?}" - -case "$KIND" in - prod) - MODEL="llama3.1:8b" - # GPU passthrough. /dev/nvidia-uvm appears once CUDA is touched; - # the nv-insmod boot workload in scripts/local-agents.sh loads - # the kernel module, so the device nodes exist by this point. - GPU_FLAGS='["--device=/dev/nvidia0","--device=/dev/nvidiactl","--device=/dev/nvidia-uvm"]' - ;; - preview) - MODEL="qwen2.5:0.5b" - GPU_FLAGS='[]' - ;; - *) echo "unknown kind: $KIND" >&2; exit 2 ;; -esac - -VM_NAME="dd-local-$KIND" -AUTH=(-H "Authorization: Bearer $DD_PAT") - -echo "== ollama-deploy $VM_NAME (model=$MODEL, cp=$CP_URL) ==" - -# ── 1. Discover the fresh agent registration on the CP ───────────── -# last_seen > started_at_iso filters out stale entries from the VM -# generation we just destroyed during `virsh destroy`. -started_at_iso="$(date -u +%Y-%m-%dT%H:%M:%SZ)" -echo " waiting for a fresh ${VM_NAME} registration (last_seen > ${started_at_iso})" -agent_host="" -for i in $(seq 1 60); do - agent_host=$(curl -fsS "${AUTH[@]}" "$CP_URL/api/agents" 2>/dev/null \ - | jq -r --arg vm "$VM_NAME" --arg since "$started_at_iso" ' - [.[] | select(.vm_name==$vm and .status=="healthy" and .last_seen > $since)] - | sort_by(.last_seen) | reverse | .[0].hostname // empty' 2>/dev/null || true) - if [ -n "$agent_host" ] && [ "$agent_host" != "null" ]; then - break - fi - sleep 10 -done -if [ -z "$agent_host" ] || [ "$agent_host" = "null" ]; then - echo "ERROR: $VM_NAME never appeared in CP fleet" >&2 - exit 1 -fi -echo " agent: https://$agent_host" - -# ── 2. Wait for Cloudflare DNS to propagate ──────────────────────── -echo " waiting for DNS on $agent_host..." 
-for i in $(seq 1 30); do - if getent hosts "$agent_host" >/dev/null 2>&1; then - echo " DNS resolved" - break - fi - sleep 5 -done - -agent() { curl -fsS --max-time 300 "${AUTH[@]}" "https://$agent_host$1" "${@:2}"; } - -# ── 3. Fetch podman-static (fetch-only DD workload) ──────────────── -# Tarball unpacks to /var/lib/easyenclave/bin/podman-linux-amd64/ -# with usr/local/bin/{podman,crun,conmon,netavark,...}. -# NOTE: omit `tag` — EE treats `tag: null` as "GET /releases/latest" -# (the real newest release), while `tag: "latest"` is a literal tag -# lookup and 404s on repos like mgoltzsche/podman-static that version -# their tags as v5.7.1 rather than with a rolling "latest" ref. -echo " POST /deploy podman-static..." -A_SPEC=$(jq -c -n '{ - app_name: "podman-static", - github_release: { - repo: "mgoltzsche/podman-static", - asset: "podman-linux-amd64.tar.gz" - } -}') -agent /deploy -H 'Content-Type: application/json' -d "$A_SPEC" | jq -c '.' || true - -echo " waiting for podman binary to appear..." -podman_path="/var/lib/easyenclave/bin/podman-linux-amd64/usr/local/bin/podman" -for i in $(seq 1 60); do - resp=$(agent /exec -H 'Content-Type: application/json' \ - -d "$(jq -c -n --arg p "$podman_path" '{cmd:["/bin/busybox","sh","-c",("test -x " + $p + " && echo found")],timeout_secs:5}')" \ - 2>/dev/null || true) - if echo "$resp" | grep -q found; then - echo " podman unpacked" - break - fi - sleep 5 -done - -# ── 4. Bootstrap: stage podman's helper binaries ─────────────────── -# mgoltzsche's tarball layout: -# usr/local/bin/ podman, crun, runc, fuse-overlayfs, -# fusermount3, pasta, pasta.avx2 -# usr/local/lib/podman/ conmon, netavark, aardvark-dns, -# rootlessport, catatonit -# EE's guest rootfs has BOTH /usr AND /etc mounted read-only. The -# only writable paths are under /var/lib/easyenclave (on the -# persistent vdc ext4 disk) and /run/tmp-style tmpfs locations. 
So -# we cannot write a containers.conf anywhere podman looks for one, -# and we cannot cp conmon into any of podman's hardcoded search -# dirs. Every path has to be on the podman CLI directly. -# -# We DO stage the helpers into /var/lib/easyenclave/bin so the -# container workload's `cmd[0]` can reach `podman`, and the -# --conmon / --runtime / --root / --runroot flags on the `podman` -# command (see step 5) point podman at the rest. -echo " bootstrapping podman (staging binaries to writable dirs)..." -bootstrap_sh='set -e -BIN=/var/lib/easyenclave/bin -SRC=$BIN/podman-linux-amd64 -cp -f $SRC/usr/local/bin/* $BIN/ -cp -f $SRC/usr/local/lib/podman/conmon $BIN/ -cp -f $SRC/usr/local/lib/podman/netavark $BIN/ 2>/dev/null || true -cp -f $SRC/usr/local/lib/podman/aardvark-dns $BIN/ 2>/dev/null || true -cp -f $SRC/usr/local/lib/podman/rootlessport $BIN/ 2>/dev/null || true -mkdir -p /var/lib/easyenclave/containers/storage /var/lib/easyenclave/containers/runroot -echo podman-bootstrap: ok' -boot_resp=$(agent /exec -H 'Content-Type: application/json' \ - -d "$(jq -c -n --arg s "$bootstrap_sh" '{cmd:["/bin/busybox","sh","-c",$s],timeout_secs:30}')") -if ! echo "$boot_resp" | jq -e '.exit_code == 0' >/dev/null 2>&1; then - echo "ERROR: podman bootstrap failed" - echo "$boot_resp" | jq . - exit 1 -fi -echo " bootstrap: $(echo "$boot_resp" | jq -r '.stdout // ""' | tail -1)" - -# ── 5. Launch the ollama container (long-running workload) ───────── -# --net=host : ollama listens on guest's 127.0.0.1:11434. -# --name : so we can `podman exec ollama ...` by name. -# --cgroup-manager=cgroupfs: matches containers.conf, still required -# on the command line because podman doesn't always -# pick it up from the engine section when invoked -# outside systemd. -# Volume : /var/lib/easyenclave/ollama is the persistent vdc -# ext4 disk (mounted by the mount-models boot workload -# in local-agents.sh); doubles as ollama's model cache -# and openclaw's npm prefix. 
-echo " POST /deploy ollama container..." -# Every writable path (--root, --runroot, --conmon, --runtime) is -# on the CLI because EE's /etc and /usr are read-only — podman -# can't fall back on /etc/containers/containers.conf the way it -# normally does. Storage lives on the persistent vdc disk so the -# 900 MB ollama image pull survives VM relaunches. -# --cgroup-manager=cgroupfs because there's no systemd in the guest. -# --network=host so ollama's :11434 binds on the VM's loopback, -# reachable from other EE workloads (like openclaw) and via /exec. -OLLAMA_SPEC=$(jq -c -n --argjson gpu "$GPU_FLAGS" '{ - app_name: "ollama", - cmd: ([ - "/var/lib/easyenclave/bin/podman", - "--conmon=/var/lib/easyenclave/bin/conmon", - "--runtime=/var/lib/easyenclave/bin/crun", - "--root=/var/lib/easyenclave/containers/storage", - "--runroot=/var/lib/easyenclave/containers/runroot", - "--cgroup-manager=cgroupfs", - "run", - "--rm", "--name", "ollama", - "--network=host" - ] + $gpu + [ - "-v", "/var/lib/easyenclave/ollama:/root/.ollama", - "-e", "OLLAMA_HOST=127.0.0.1:11434", - "docker.io/ollama/ollama:latest", - "serve" - ]) -}') -agent /deploy -H 'Content-Type: application/json' -d "$OLLAMA_SPEC" | jq -c '.' || true - -# ── 6. Wait for ollama HTTP to come up inside the container ──────── -# `podman exec ollama ollama list` exits 0 once the server is ready. -# First run has to pull ~900 MB of container image, so allow plenty. -echo " waiting for ollama to be ready (first run pulls the image)..." 
-ollama_ready=0 -for i in $(seq 1 120); do - resp=$(agent /exec -H 'Content-Type: application/json' \ - -d '{"cmd":["/var/lib/easyenclave/bin/podman","--root=/var/lib/easyenclave/containers/storage","--runroot=/var/lib/easyenclave/containers/runroot","--cgroup-manager=cgroupfs","exec","ollama","ollama","list"],"timeout_secs":15}' \ - 2>/dev/null || true) - if echo "$resp" | jq -e '.exit_code == 0' >/dev/null 2>&1; then - echo " ollama responding" - ollama_ready=1 - break - fi - sleep 10 -done -if [ "$ollama_ready" = "0" ]; then - echo "ERROR: ollama container never became ready (20 min timeout)" - echo " most recent /exec response:" - echo "$resp" | jq . - echo " last 30 lines of 'podman ps -a' + 'podman logs ollama':" - agent /exec -H 'Content-Type: application/json' \ - -d '{"cmd":["/var/lib/easyenclave/bin/podman","--root=/var/lib/easyenclave/containers/storage","--runroot=/var/lib/easyenclave/containers/runroot","ps","-a"],"timeout_secs":10}' | jq -r '.stdout // .stderr // ""' - agent /exec -H 'Content-Type: application/json' \ - -d '{"cmd":["/var/lib/easyenclave/bin/podman","--root=/var/lib/easyenclave/containers/storage","--runroot=/var/lib/easyenclave/containers/runroot","logs","ollama"],"timeout_secs":10}' 2>&1 | jq -r '.stdout // .stderr // ""' | tail -30 - exit 1 -fi - -# ── 7. Pull the model ────────────────────────────────────────────── -echo " pulling $MODEL (this can take a few minutes)..." -pull_resp=$(agent /exec -H 'Content-Type: application/json' \ - -d "$(jq -c -n --arg m "$MODEL" '{ - cmd:["/var/lib/easyenclave/bin/podman","--root=/var/lib/easyenclave/containers/storage","--runroot=/var/lib/easyenclave/containers/runroot","--cgroup-manager=cgroupfs","exec","ollama","ollama","pull",$m], - timeout_secs:1800 - }')") -if ! echo "$pull_resp" | jq -e '.exit_code == 0' >/dev/null 2>&1; then - echo "ERROR: ollama pull $MODEL failed" - echo "$pull_resp" | jq . 
- exit 1 -fi -echo " pull: $(echo "$pull_resp" | jq -r '.stdout // "(no stdout)"' | tail -3)" - -# ── 8. Launch OpenClaw ───────────────────────────────────────────── -# `ollama launch openclaw` installs via npm on first run if missing -# and then stays foreground, so we register it as a second long- -# running workload. --yes accepts the install prompt non-interactively. -echo " POST /deploy openclaw..." -OPENCLAW_SPEC=$(jq -c -n --arg m "$MODEL" '{ - app_name: "openclaw", - cmd: [ - "/var/lib/easyenclave/bin/podman", - "--root=/var/lib/easyenclave/containers/storage", - "--runroot=/var/lib/easyenclave/containers/runroot", - "--cgroup-manager=cgroupfs", - "exec", "ollama", - "ollama", "launch", "openclaw", - "--model", $m, - "--yes" - ] -}') -agent /deploy -H 'Content-Type: application/json' -d "$OPENCLAW_SPEC" | jq -c '.' || true - -# ── 9. Confirm openclaw is up ─ three probes, weakest → strongest ── -# (a) EE lists `openclaw` in /health — proves the workload was -# accepted by the in-VM runtime. Flips green on fork, before -# npm-install finishes, so on its own it's weak. -# (b) GET http://127.0.0.1:18789/healthz (the OpenClaw gateway HTTP -# endpoint). Docs: https://docs.openclaw.ai/gateway/health. -# 200 with valid JSON = gateway has bound its port and is -# serving. The ollama container runs with --net=host so the -# loopback is the VM's loopback; we curl through `podman exec` -# so we hit the in-container curl (EE's busybox lacks one). -# (c) `openclaw agent --message "ping"` — the documented one-shot -# CLI. Goes through the running gateway, hands the prompt to -# the loaded model, returns the assistant reply. Exit 0 AND -# non-empty stdout = the full ollama → openclaw → model path -# works end-to-end. The reply gets echoed into the workflow -# log as proof of life. -echo " confirming openclaw workload is registered with EE..." 
-for i in $(seq 1 30); do
-  list=$(agent /health 2>/dev/null || true)
-  if echo "$list" | jq -e '.deployments // [] | index("openclaw")' >/dev/null 2>&1; then
-    echo " openclaw: registered"
-    break
-  fi
-  sleep 5
-done
-
-echo " waiting for openclaw gateway on http://127.0.0.1:18789/healthz..."
-openclaw_live=0
-for i in $(seq 1 60); do
-  resp=$(agent /exec -H 'Content-Type: application/json' \
-    -d '{"cmd":["/var/lib/easyenclave/bin/podman","--root=/var/lib/easyenclave/containers/storage","--runroot=/var/lib/easyenclave/containers/runroot","--cgroup-manager=cgroupfs","exec","ollama","curl","-fsS","http://127.0.0.1:18789/healthz"],"timeout_secs":10}' \
-    2>/dev/null || true)
-  if echo "$resp" | jq -e '.exit_code == 0' >/dev/null 2>&1; then
-    echo " openclaw: /healthz 200"
-    echo "$resp" | jq -r '.stdout // ""' | head -c 200 | sed 's/^/ /'
-    echo
-    openclaw_live=1
-    break
-  fi
-  sleep 5
-done
-
-if [ "$openclaw_live" != "1" ]; then
-  echo "ERROR: openclaw /healthz never returned 200 (gateway didn't come up within 5 min)"
-  echo " last /exec response:"
-  echo "$resp" | jq -c '.' | head -c 500
-  exit 1
-fi
-
-echo " sending a round-trip prompt: 'ping'"
-chat=$(agent /exec -H 'Content-Type: application/json' \
-  -d '{"cmd":["/var/lib/easyenclave/bin/podman","--root=/var/lib/easyenclave/containers/storage","--runroot=/var/lib/easyenclave/containers/runroot","--cgroup-manager=cgroupfs","exec","ollama","openclaw","agent","--message","ping","--thinking","low"],"timeout_secs":120}' \
-  2>/dev/null || true)
-reply=$(echo "$chat" | jq -r '.stdout // ""')
-if [ -z "$reply" ] || ! echo "$chat" | jq -e '.exit_code == 0' >/dev/null 2>&1; then
-  echo "ERROR: openclaw agent --message didn't return a reply"
-  echo " raw: $(echo "$chat" | jq -c '.' | head -c 500)"
-  exit 1
-fi
-echo
-echo "=== openclaw replied ==="
-echo "$reply"
-echo "========================"
-
-echo
-echo "=== agent fleet summary ==="
-echo " agent: https://$agent_host"
-echo " model: $MODEL"
-echo " ollama: podman container 'ollama' on host net, :11434"
-echo " openclaw: http://127.0.0.1:18789 (gateway), replied to round-trip ping"
-echo "==========================="
diff --git a/scripts/redeploy-workload.sh b/scripts/redeploy-workload.sh
new file mode 100755
index 0000000..e866d3f
--- /dev/null
+++ b/scripts/redeploy-workload.sh
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# redeploy-workload.sh — POST one baked workload spec to a live agent's
+# /deploy endpoint. Handy for iterating on apps/NAME/workload.json
+# without rebuilding the whole config.iso + restarting the VM.
+#
+# Usage:
+#   redeploy-workload.sh CP_URL VM_NAME APP_PATH
+#
+# Example:
+#   DD_PAT=$(gh auth token) \
+#   ./scripts/redeploy-workload.sh \
+#     https://app.devopsdefender.com \
+#     dd-local-prod \
+#     apps/openclaw/workload.json.tmpl
+#
+# Requires DD_PAT in env. Template envs (MODEL, DD_CP_URL, …) must
+# also be exported if the referenced workload file is a .tmpl.
+
+set -euo pipefail
+
+CP_URL="${1?usage: redeploy-workload.sh CP_URL VM_NAME APP_PATH}"
+VM_NAME="${2?vm_name required (e.g. dd-local-prod)}"
+APP_PATH="${3?app_path required (e.g. apps/openclaw/workload.json.tmpl)}"
+: "${DD_PAT?set DD_PAT (e.g. DD_PAT=\$(gh auth token))}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=./workloads.sh
+source "$SCRIPT_DIR/workloads.sh"
+
+AUTH=(-H "Authorization: Bearer $DD_PAT")
+
+# Discover the agent's tunnel hostname via CP's fleet API.
+# Under `set -e` + pipefail a failed curl/jq would abort the script right
+# here with no message (curl's stderr is discarded), so fall back to an
+# empty host and let the check below print the real error.
+agent_host=$(
+  curl -fsS "${AUTH[@]}" "$CP_URL/api/agents" 2>/dev/null \
+    | jq -r --arg vm "$VM_NAME" '
+      [.[] | select(.vm_name==$vm and .status=="healthy")]
+      | sort_by(.last_seen) | reverse | .[0].hostname // empty'
+) || agent_host=""
+if [ -z "$agent_host" ] || [ "$agent_host" = "null" ]; then
+  echo "ERROR: no healthy $VM_NAME in $CP_URL/api/agents" >&2
+  exit 1
+fi
+echo "agent: https://$agent_host"
+
+spec=$(bake "$APP_PATH")
+echo "redeploying $(echo "$spec" | jq -r .app_name)..."
+curl -fsS --max-time 60 "${AUTH[@]}" \
+  "https://$agent_host/deploy" \
+  -H 'Content-Type: application/json' \
+  -d "$spec" | jq -c .
diff --git a/scripts/workloads.sh b/scripts/workloads.sh
new file mode 100755
index 0000000..c41fe47
--- /dev/null
+++ b/scripts/workloads.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+# workloads.sh — shared helpers for assembling EE workload specs.
+#
+# A DD "workload" is a JSON object with {app_name, github_release,
+# cmd, env} that EE's DeployRequest consumes. Each app lives in
+# apps/NAME/workload.json (literal) or apps/NAME/workload.json.tmpl
+# (with ${VAR} placeholders substituted at bake time from the caller's
+# environment).
+#
+# Public functions:
+#   bake PATH            — print one rendered workload to stdout.
+#                          Plain .json is emitted as-is; .json.tmpl
+#                          gets envsubst + empty-env-entry stripping.
+#   join PATH [PATH…]    — print a JSON array of rendered workloads.
+#
+# Sourced from scripts/local-agents.sh, scripts/gcp-deploy.sh and
+# scripts/redeploy-workload.sh so every caller shares one source of
+# truth for the workload shape.
+
+# Render a single workload file as compact one-line JSON on stdout.
+#   $1 — path to a workload.json or workload.json.tmpl file.
+# For .json files, passthrough (re-emitted via jq -c).
+# For .json.tmpl files, substitute ${VAR} from the current env, then
+# remove any "KEY=" env array entries that ended up with an empty
+# value (matches the conditional-include pattern gcp-deploy.sh used
+# for DD_GITHUB_CLIENT_ID & co).
+# Returns non-zero if the file type is unknown or envsubst/jq fails.
+bake() {
+  local path="$1" substituted
+  if [[ "$path" == *.json ]]; then
+    jq -c . "$path"
+  elif [[ "$path" == *.json.tmpl ]]; then
+    # Capture envsubst's output first so its failure is reported even when
+    # the sourcing script has not enabled pipefail.
+    substituted=$(envsubst < "$path") || return
+    jq -c 'if .env then .env |= map(select(. | test("^[^=]+=.+"))) else . end' \
+      <<< "$substituted"
+  else
+    echo "workloads.sh: unknown workload file type: $path" >&2
+    return 1
+  fi
+}
+
+# Print a JSON array of rendered workloads.
+#   $@ — workload file paths, each rendered with bake; no args prints [].
+# Returns 1 as soon as any file fails to bake (nothing is printed then).
+# NOTE: inside a shell that sources this file, this function shadows the
+# external join(1) utility.
+join() {
+  # Keep every working variable local: this file is sourced, so a bare
+  # loop variable would leak into (and clobber) the caller's scope.
+  local out="[" sep="" p rendered
+  for p in "$@"; do
+    rendered=$(bake "$p") || return 1
+    out+="$sep$rendered"
+    sep=","
+  done
+  out+="]"
+  echo "$out"
+}