From 74aa03c517b87a49b22f913296f04096d5640f9c Mon Sep 17 00:00:00 2001 From: alex newman Date: Tue, 28 Apr 2026 13:10:42 -0400 Subject: [PATCH] Rewrite around v2 agent ownership model --- .github/actions/assign/action.yml | 67 ++ .github/actions/dd-deploy/README.md | 81 -- .github/actions/dd-deploy/action.yml | 131 --- .github/actions/deploy/action.yml | 75 ++ .github/actions/relaunch-agent/action.yml | 219 ---- .github/actions/relaunch-cp/action.yml | 79 -- .github/actions/verify/action.yml | 69 ++ .github/workflows/ci.yml | 25 + .github/workflows/cleanup.yml | 353 ------ .github/workflows/deploy-cp.yml | 613 ---------- .github/workflows/release.yml | 208 ---- .github/workflows/set-agent-owner.yml | 100 -- .github/workflows/website-preview.yml | 23 - .nojekyll | 0 CNAME | 1 - Cargo.lock | 828 ++------------ Cargo.toml | 33 +- Dockerfile | 30 - README.md | 166 +-- apps/README.md | 169 --- apps/_infra/dd-relaunch-cp.sh | 56 - apps/_infra/dd-relaunch.sh | 71 -- apps/_infra/ee-sync.sh | 102 -- apps/_infra/local-agents.sh | 421 ------- apps/_infra/local-cp.sh | 214 ---- apps/cloudflared/workload.json | 8 - apps/dd-agent/workload.json.tmpl | 25 - apps/dd-management/workload.json.tmpl | 28 - apps/hello-world/workload.json | 7 - apps/mount-data/workload.json | 7 - apps/nv/workload.json | 7 - apps/podman-bootstrap/workload.json | 7 - apps/podman-static/workload.json | 7 - apps/ttyd/workload.json | 11 - apps/web-nvidia-smi/workload.json | 8 - crates/dd-agent/Cargo.toml | 18 + crates/dd-agent/README.md | 28 + crates/dd-agent/src/main.rs | 793 +++++++++++++ docs/rewrite-plan.md | 217 ++++ docs/spec-v2.md | 221 ++++ docs/threat-model-v2.md | 181 +++ index.html | 261 ----- src/agent.rs | 969 ---------------- src/cf.rs | 873 -------------- src/collector.rs | 365 ------ src/config.rs | 367 ------ src/cp.rs | 1258 --------------------- src/devices.rs | 298 ----- src/ee.rs | 119 -- src/error.rs | 54 - src/gh_oidc.rs | 605 ---------- src/html.rs | 73 -- src/ita.rs | 253 ----- 
src/lib.rs | 15 - src/main.rs | 30 - src/metrics.rs | 206 ---- src/noise_gateway/allowlist.rs | 90 -- src/noise_gateway/attest.rs | 159 --- src/noise_gateway/mod.rs | 52 - src/noise_gateway/noise.rs | 215 ---- src/noise_gateway/upstream.rs | 227 ---- src/stonith.rs | 124 -- src/taint.rs | 119 -- style.css | 146 --- 64 files changed, 1882 insertions(+), 10703 deletions(-) create mode 100644 .github/actions/assign/action.yml delete mode 100644 .github/actions/dd-deploy/README.md delete mode 100644 .github/actions/dd-deploy/action.yml create mode 100644 .github/actions/deploy/action.yml delete mode 100644 .github/actions/relaunch-agent/action.yml delete mode 100644 .github/actions/relaunch-cp/action.yml create mode 100644 .github/actions/verify/action.yml create mode 100644 .github/workflows/ci.yml delete mode 100644 .github/workflows/cleanup.yml delete mode 100644 .github/workflows/deploy-cp.yml delete mode 100644 .github/workflows/release.yml delete mode 100644 .github/workflows/set-agent-owner.yml delete mode 100644 .github/workflows/website-preview.yml delete mode 100644 .nojekyll delete mode 100644 CNAME delete mode 100644 Dockerfile delete mode 100644 apps/README.md delete mode 100755 apps/_infra/dd-relaunch-cp.sh delete mode 100755 apps/_infra/dd-relaunch.sh delete mode 100755 apps/_infra/ee-sync.sh delete mode 100755 apps/_infra/local-agents.sh delete mode 100755 apps/_infra/local-cp.sh delete mode 100644 apps/cloudflared/workload.json delete mode 100644 apps/dd-agent/workload.json.tmpl delete mode 100644 apps/dd-management/workload.json.tmpl delete mode 100644 apps/hello-world/workload.json delete mode 100644 apps/mount-data/workload.json delete mode 100644 apps/nv/workload.json delete mode 100644 apps/podman-bootstrap/workload.json delete mode 100644 apps/podman-static/workload.json delete mode 100644 apps/ttyd/workload.json delete mode 100644 apps/web-nvidia-smi/workload.json create mode 100644 crates/dd-agent/Cargo.toml create mode 100644 
crates/dd-agent/README.md create mode 100644 crates/dd-agent/src/main.rs create mode 100644 docs/rewrite-plan.md create mode 100644 docs/spec-v2.md create mode 100644 docs/threat-model-v2.md delete mode 100644 index.html delete mode 100644 src/agent.rs delete mode 100644 src/cf.rs delete mode 100644 src/collector.rs delete mode 100644 src/config.rs delete mode 100644 src/cp.rs delete mode 100644 src/devices.rs delete mode 100644 src/ee.rs delete mode 100644 src/error.rs delete mode 100644 src/gh_oidc.rs delete mode 100644 src/html.rs delete mode 100644 src/ita.rs delete mode 100644 src/lib.rs delete mode 100644 src/main.rs delete mode 100644 src/metrics.rs delete mode 100644 src/noise_gateway/allowlist.rs delete mode 100644 src/noise_gateway/attest.rs delete mode 100644 src/noise_gateway/mod.rs delete mode 100644 src/noise_gateway/noise.rs delete mode 100644 src/noise_gateway/upstream.rs delete mode 100644 src/stonith.rs delete mode 100644 src/taint.rs delete mode 100644 style.css diff --git a/.github/actions/assign/action.yml b/.github/actions/assign/action.yml new file mode 100644 index 0000000..7536396 --- /dev/null +++ b/.github/actions/assign/action.yml @@ -0,0 +1,67 @@ +name: Assign DD v2 agent owner +description: >- + Mint a GitHub Actions OIDC token for the assignment authority and + idempotently assign a DD v2 agent to a GitHub principal. + +inputs: + agent-url: + description: 'Agent base URL, e.g. 
https://agent.example.com' + required: true + owner-kind: + description: 'Principal kind: user, org, or repo' + required: true + owner-name: + description: 'GitHub login or owner/repo path' + required: true + owner-id: + description: 'Numeric GitHub user/org/repo id' + required: true + claim-id: + description: 'External lease/claim id; safe to reuse for idempotent reconciliation' + required: false + default: '' + audience: + description: 'OIDC audience expected by the agent' + required: false + default: dd-agent + +outputs: + changed: + description: 'Whether the assignment changed runtime state' + value: ${{ steps.assign.outputs.changed }} + +runs: + using: composite + steps: + - name: Assign owner + id: assign + shell: bash + env: + AGENT_URL: ${{ inputs.agent-url }} + OWNER_KIND: ${{ inputs.owner-kind }} + OWNER_NAME: ${{ inputs.owner-name }} + OWNER_ID: ${{ inputs.owner-id }} + CLAIM_ID: ${{ inputs.claim-id }} + AUDIENCE: ${{ inputs.audience }} + run: | + set -euo pipefail + if [ -z "${ACTIONS_ID_TOKEN_REQUEST_TOKEN:-}" ]; then + echo "::error::id-token unavailable; set 'permissions: id-token: write'" + exit 1 + fi + oidc=$(curl -fsSL \ + -H "Authorization: Bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \ + "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=${AUDIENCE}" | jq -r .value) + body=$(jq -n \ + --arg kind "$OWNER_KIND" \ + --arg name "$OWNER_NAME" \ + --argjson id "$OWNER_ID" \ + --arg claim "$CLAIM_ID" \ + '{owner:{kind:$kind,name:$name,id:$id},claim_id:$claim}') + resp=$(curl -fsSL \ + -X POST "${AGENT_URL%/}/owner" \ + -H "Authorization: Bearer ${oidc}" \ + -H "Content-Type: application/json" \ + -d "$body") + echo "$resp" | jq . 
+ echo "changed=$(echo "$resp" | jq -r .changed)" >> "$GITHUB_OUTPUT" diff --git a/.github/actions/dd-deploy/README.md b/.github/actions/dd-deploy/README.md deleted file mode 100644 index 8e936e7..0000000 --- a/.github/actions/dd-deploy/README.md +++ /dev/null @@ -1,81 +0,0 @@ -# dd-deploy - -GitHub Action that deploys a workload JSON to a DD agent using a **per-job GitHub Actions OIDC token**. No shared secrets to configure — the agent verifies the token against GitHub's JWKS and checks that `repository_owner == DD_OWNER`. - -## Usage - -```yaml -jobs: - deploy: - runs-on: ubuntu-latest - permissions: - id-token: write # required to mint the OIDC token - contents: read - steps: - - uses: actions/checkout@v4 - - uses: devopsdefender/dd/.github/actions/dd-deploy@main - with: - cp-url: https://app.devopsdefender.com - vm-name: dd-local-prod - workload: apps/myapp/workload.json -``` - -The action will: - -1. Look up the current hostname for your target agent via the CP's `/api/agents`. -2. Mint a GitHub Actions OIDC JWT with `audience: dd-agent`. -3. POST the baked workload JSON to `https:///deploy` with `Authorization: Bearer `. -4. Poll the agent's `/health` until the deployment appears (or fail after `wait-for-deployment-seconds`). - -## Inputs - -| name | required | default | description | -| --- | --- | --- | --- | -| `cp-url` | yes | — | Control-plane URL (e.g. `https://app.devopsdefender.com`) | -| `vm-name` | yes | — | Target agent `vm_name` as reported on `/api/agents` | -| `workload` | yes | — | Path to the workload JSON spec | -| `audience` | no | `dd-agent` | OIDC audience the agent expects | -| `wait-for-deployment-seconds` | no | `120` | Poll `/health` until `app_name` appears. `0` disables waiting. 
| - -## Outputs - -| name | description | -| --- | --- | -| `agent-host` | Hostname the workload landed on | -| `app-name` | `app_name` parsed from the workload JSON | - -## Trust model - -The agent's `/deploy` endpoint is CF-Access-bypassed and gated entirely by an in-code OIDC check: - -- Issuer must be `https://token.actions.githubusercontent.com` -- Signature must verify against GitHub's live JWKS -- `repository_owner` claim must equal `DD_OWNER` (set on the CP at boot) -- `audience` claim must match the agent's configured audience - -This means **any workflow in the DD GitHub organization can deploy, with no credentials stored anywhere**. Workflows in a different org (including forks) fail the `repository_owner` check and get 401. - -## Example: deploy on PR merge - -```yaml -name: deploy-myapp - -on: - push: - branches: [main] - -jobs: - deploy: - runs-on: ubuntu-latest - permissions: - id-token: write - contents: read - steps: - - uses: actions/checkout@v4 - - uses: devopsdefender/dd/.github/actions/dd-deploy@main - with: - cp-url: https://app.devopsdefender.com - vm-name: dd-local-prod - workload: apps/myapp/workload.json - wait-for-deployment-seconds: 300 -``` diff --git a/.github/actions/dd-deploy/action.yml b/.github/actions/dd-deploy/action.yml deleted file mode 100644 index 09339b9..0000000 --- a/.github/actions/dd-deploy/action.yml +++ /dev/null @@ -1,131 +0,0 @@ -name: Deploy workload to a DD agent -description: >- - Mint a per-job GitHub Actions OIDC token, discover the target agent's - current hostname from the DD control-plane's /api/agents, and POST - the workload JSON to the agent's /deploy endpoint. The agent verifies - the OIDC token (issuer = token.actions.githubusercontent.com, - repository_owner = DD_OWNER, audience match) entirely in-code — no - shared secrets to configure. The caller's job must declare - `permissions: id-token: write`. - -inputs: - cp-url: - description: 'Control-plane URL (e.g. 
https://app.devopsdefender.com)' - required: true - vm-name: - description: 'Target agent vm_name as reported on /api/agents (e.g. dd-local-prod)' - required: true - workload: - description: 'Path to the workload JSON spec to deploy' - required: true - audience: - description: 'OIDC audience the agent expects — must match gh_oidc::Verifier audience (default: dd-agent)' - required: false - default: dd-agent - wait-for-deployment-seconds: - description: 'Seconds to poll the agent /health waiting for app_name in deployments. 0 = skip wait.' - required: false - default: '120' - -outputs: - agent-host: - description: 'Hostname the workload was deployed to' - value: ${{ steps.deploy.outputs.agent-host }} - app-name: - description: 'app_name parsed from the workload JSON' - value: ${{ steps.deploy.outputs.app-name }} - -runs: - using: composite - steps: - - name: Deploy - id: deploy - shell: bash - env: - CP_URL: ${{ inputs.cp-url }} - VM_NAME: ${{ inputs.vm-name }} - WORKLOAD: ${{ inputs.workload }} - AUDIENCE: ${{ inputs.audience }} - WAIT_SECS: ${{ inputs.wait-for-deployment-seconds }} - run: | - set -euo pipefail - - if [ -z "${ACTIONS_ID_TOKEN_REQUEST_TOKEN:-}" ]; then - echo "::error::id-token not available — the calling job needs 'permissions: id-token: write'" - exit 1 - fi - - # Mint the GH OIDC token first — `/api/agents` now requires - # Authorization (repository_owner must match DD_OWNER), same - # as `/deploy`. Audience matches what the agent verifier - # expects; the CP reuses that. - oidc=$(curl -fsSL \ - -H "Authorization: Bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \ - "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=${AUDIENCE}" | jq -r .value) - - # Discover the agent hostname on the CP. Curl's native --retry - # absorbs the transient 5xx (notably CF 530 "origin - # unreachable") that reliably hits right after an agent - # relaunch — the edge is momentarily between routes. ~30 s - # of retries, no bash loop. 
- agent_host=$(curl -fsS \ - --retry 6 --retry-delay 5 --retry-all-errors \ - -H "Authorization: Bearer ${oidc}" \ - "${CP_URL}/api/agents" \ - | jq -r --arg vm "$VM_NAME" ' - [.[] | select(.vm_name == $vm and .status == "healthy")] - | sort_by(.last_seen) | reverse | .[0].hostname // empty') - if [ -z "$agent_host" ]; then - echo "::error::no healthy agent with vm_name=$VM_NAME" - exit 1 - fi - echo "agent-host=$agent_host" >> "$GITHUB_OUTPUT" - - # Compact + inspect the workload spec. - spec=$(jq -c . "$WORKLOAD") - app_name=$(echo "$spec" | jq -r '.app_name // empty') - if [ -z "$app_name" ]; then - echo "::error::$WORKLOAD has no .app_name" - exit 1 - fi - echo "app-name=$app_name" >> "$GITHUB_OUTPUT" - - echo "POST /deploy to $agent_host (app=$app_name, repo=$GITHUB_REPOSITORY)" - # Same CF-edge transient can hit the agent hostname too — - # reuse the same retry envelope. Capture the body so we can - # show what EE said; the agent forwards it verbatim. - body=$(curl -fsS \ - --retry 6 --retry-delay 5 --retry-all-errors \ - -X POST "https://${agent_host}/deploy" \ - -H "Authorization: Bearer ${oidc}" \ - -H "Content-Type: application/json" \ - -d "$spec") - echo "agent response: $body" - - if [ "$WAIT_SECS" -gt 0 ]; then - deadline=$(( $(date +%s) + WAIT_SECS )) - while [ "$(date +%s)" -lt "$deadline" ]; do - health=$(curl -fsS "https://${agent_host}/health" 2>/dev/null || echo '{}') - if echo "$health" | jq -e --arg app "$app_name" '.deployments // [] | index($app)' >/dev/null; then - echo "$app_name visible on $agent_host" - exit 0 - fi - sleep 5 - done - echo "::error::$app_name never appeared in /health within ${WAIT_SECS}s. Last deployments:" - echo "$health" | jq '.deployments // []' - # Surface the agent's own stdout so we have agent-side - # ground truth without needing SSH. Failing to fetch is - # itself a signal ("agent isn't reachable") so we keep - # exit 1 regardless — but we want the fetch attempt to - # print whatever it can. 
- echo "--- dd-agent log from ${agent_host} ---" - agent_log=$(curl -fsS -H "Authorization: Bearer ${oidc}" \ - "https://${agent_host}/logs/dd-agent" 2>&1 || true) - if echo "$agent_log" | jq -e '.lines' >/dev/null 2>&1; then - echo "$agent_log" | jq -r '.lines[]?' - else - echo "$agent_log" - fi - exit 1 - fi diff --git a/.github/actions/deploy/action.yml b/.github/actions/deploy/action.yml new file mode 100644 index 0000000..c39aa3e --- /dev/null +++ b/.github/actions/deploy/action.yml @@ -0,0 +1,75 @@ +name: Deploy to DD v2 agent +description: >- + Deploy a workload JSON to a DD v2 agent using the calling repository's + GitHub Actions OIDC identity. The agent accepts only its current owner. + +inputs: + agent-url: + description: 'Agent base URL, e.g. https://agent.example.com' + required: true + workload: + description: 'Path to workload JSON' + required: true + audience: + description: 'OIDC audience expected by the agent' + required: false + default: dd-agent + wait-seconds: + description: 'Seconds to wait for workload to appear in /health; 0 disables wait' + required: false + default: '120' + +outputs: + app-name: + description: 'Deployed app_name' + value: ${{ steps.deploy.outputs.app-name }} + +runs: + using: composite + steps: + - name: Deploy workload + id: deploy + shell: bash + env: + AGENT_URL: ${{ inputs.agent-url }} + WORKLOAD: ${{ inputs.workload }} + AUDIENCE: ${{ inputs.audience }} + WAIT_SECONDS: ${{ inputs.wait-seconds }} + run: | + set -euo pipefail + if [ -z "${ACTIONS_ID_TOKEN_REQUEST_TOKEN:-}" ]; then + echo "::error::id-token unavailable; set 'permissions: id-token: write'" + exit 1 + fi + app=$(jq -r '.app_name // empty' "$WORKLOAD") + if [ -z "$app" ]; then + echo "::error::$WORKLOAD missing .app_name" + exit 1 + fi + echo "app-name=$app" >> "$GITHUB_OUTPUT" + + echo "Preflight proof:" + curl -fsSL "${AGENT_URL%/}/health" | jq '{agent_id, owner, capabilities}' + + oidc=$(curl -fsSL \ + -H "Authorization: Bearer 
${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \ + "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=${AUDIENCE}" | jq -r .value) + curl -fsSL \ + -X POST "${AGENT_URL%/}/deploy" \ + -H "Authorization: Bearer ${oidc}" \ + -H "Content-Type: application/json" \ + -d @"$WORKLOAD" | jq . + + if [ "$WAIT_SECONDS" -gt 0 ]; then + deadline=$(( $(date +%s) + WAIT_SECONDS )) + while [ "$(date +%s)" -lt "$deadline" ]; do + proof=$(curl -fsSL "${AGENT_URL%/}/health" || echo '{}') + if echo "$proof" | jq -e --arg app "$app" '.workloads[]? | select(.app_name == $app)' >/dev/null; then + echo "$app visible in proof" + exit 0 + fi + sleep 5 + done + echo "::error::$app did not appear in /health within ${WAIT_SECONDS}s" + exit 1 + fi diff --git a/.github/actions/relaunch-agent/action.yml b/.github/actions/relaunch-agent/action.yml deleted file mode 100644 index ce66736..0000000 --- a/.github/actions/relaunch-agent/action.yml +++ /dev/null @@ -1,219 +0,0 @@ -name: Relaunch local TDX agent -description: >- - SSH into the tdx2 host, recreate the matching dd-local-{kind} libvirt - domain against the given CP url (pulling apps/ from the given git ref), - then block until the agent re-registers with the CP. A release is "done" - only when this action succeeds end-to-end. - -inputs: - kind: - description: 'prod | preview — which libvirt domain to relaunch' - required: true - url: - description: 'CP URL the agent should register against (e.g. https://app.devopsdefender.com)' - required: true - ref: - description: 'git ref whose scripts/apps tree dd-relaunch.sh should check out on the host' - required: true - ssh-key: - description: 'Private SSH key for tdx2@host' - required: true - host: - description: 'Public host address of the tdx2 node' - required: true - ita-api-key: - description: 'Intel Trust Authority API key for attestation' - required: true - release-tag: - description: 'devopsdefender release tag the local agent should download (e.g. 
pr-abc123, latest)' - required: true - ee-channel: - description: 'easyenclave release channel to sync the libvirt base qcow2 from (stable | staging)' - required: false - default: 'staging' - ee-tag: - description: 'Explicit easyenclave release tag override for pre-flight testing a candidate (e.g. image-abc123456). Wins over `ee-channel`.' - required: false - default: '' - stream-console: - description: 'Tail /var/log/ee-local-.log from tdx2 into the CI log for the duration of the register-wait. Surfaces what the agent VM is actually doing when it fails to register.' - required: false - default: 'true' - owner: - description: 'Fleet principal — GitHub login or owner/repo path. Forwarded as EE_OWNER to local-agents.sh on the tdx2 host, which resolves it to id+kind via gh api and bakes all three into agent.env.' - required: true - -runs: - using: composite - steps: - # curl's native --retry handles the CP-healthy wait. On PR pushes - # we race with Release's deploy-preview standing up the pr-N CP; - # /health is CF-bypassed. 60 attempts × 10s = 10 min budget. - - name: Wait for CP to be healthy - shell: bash - env: - URL: ${{ inputs.url }} - run: | - curl -fsS --retry 60 --retry-delay 10 --retry-all-errors \ - --max-time 5 "$URL/health" >/dev/null - echo "CP $URL healthy" - - # appleboy/ssh-action owns key + known_hosts setup; we just hand - # it the script to run on the tdx2 host. dd-relaunch.sh finishes - # in ~10s — the baked config.iso's EE_BOOT_WORKLOADS drives the - # rest of the boot asynchronously. 
- - name: ssh + relaunch VM - uses: appleboy/ssh-action@v1.2.0 - env: - KIND: ${{ inputs.kind }} - URL: ${{ inputs.url }} - REF: ${{ inputs.ref }} - RELEASE_TAG: ${{ inputs.release-tag }} - DD_ITA_API_KEY: ${{ inputs.ita-api-key }} - DD_EE_CHANNEL: ${{ inputs.ee-channel }} - DD_EE_TAG: ${{ inputs.ee-tag }} - EE_OWNER: ${{ inputs.owner }} - with: - host: ${{ inputs.host }} - username: tdx2 - key: ${{ inputs.ssh-key }} - envs: KIND,URL,REF,RELEASE_TAG,DD_ITA_API_KEY,DD_EE_CHANNEL,DD_EE_TAG,EE_OWNER - script: | - /home/tdx2/src/dd/apps/_infra/dd-relaunch.sh "$KIND" "$URL" "$REF" "$RELEASE_TAG" - - # Block until the freshly-booted agent VM registers with the CP. - # This is the "I can see the local agent deployment worked" - # signal that gates the whole release. - # - # Budget: cold VM boot ~60s + EE pre-fetch of the dd binary - # from GitHub releases (can be 60s+ on a cold cache) + - # cloudflared tunnel ~30s + agent ITA mint (one round-trip to - # Intel) + register with the CP's 6-retry backoff path - # (~105s cumulative if CF edge is still propagating). Worst - # case is ~7 min end-to-end, so 5 min was too tight — preview - # deploys were flaking here even when the agent eventually - # came up. 60 × 10s = 10 min covers it with headroom. - # - # Simultaneously we tail the VM's serial console log from - # /var/log/ee-local-.log on tdx2 so every `eprintln!` - # the agent prints (mint failures, register errors, EE boot - # issues) lands in the CI log in real time. Previously the - # only visibility was "never registered" — now the cause is - # in the same job log as the wait. - # - # On failure we also dump the last /api/agents payload so - # the CP-side view is captured alongside the VM-side view. 
- - name: Verify agent registered with CP - shell: bash - env: - URL: ${{ inputs.url }} - KIND: ${{ inputs.kind }} - HOST: ${{ inputs.host }} - SSH_KEY: ${{ inputs.ssh-key }} - STREAM_CONSOLE: ${{ inputs.stream-console }} - run: | - set -uo pipefail - vm="dd-local-$KIND" - since=$(date -u +%Y-%m-%dT%H:%M:%SZ) - - # Start the serial-console tail first so the boot messages - # that arrive during the register wait all show up in-line. - # `tail -F` follows rotation; `ssh -T` suppresses the pty. - # Output is prefixed so console lines are distinguishable - # from the wait loop's own status prints. - tail_pid="" - # Mirror of the tailed console, grepped by the register-wait - # loop below for `devopsdefender: fatal:` — the single - # `eprintln!` src/main.rs emits on unrecoverable error. - # If the agent's register retries all fail (e.g. dnsmasq - # negative-cached NXDOMAIN from the CP's hydrate probe), the - # agent exits and will never appear in `/api/agents`; without - # this mirror the CI loop would poll a corpse for its full - # 10-min budget. Always produced even when streaming is off, - # since the grep is what saves the budget. - mirror=$(mktemp) - trap 'rm -f "$mirror"' EXIT - if [ "$STREAM_CONSOLE" = "true" ]; then - key=$(mktemp) - trap 'rm -f "$key" "$mirror"' EXIT - printf '%s\n' "$SSH_KEY" > "$key" - chmod 600 "$key" - # `stdbuf -oL` on both ends: force tail and sed to line-buffer - # so each VM boot line streams into this job's log as it's - # written, rather than block-buffering until the pipe closes. - # Dropping `2>/dev/null` on tail so "cannot open" / "retrying" - # diagnostics surface — if the serial log file never appears, - # we want the reason on-screen, not a silent wait. - ssh -i "$key" -T \ - -o StrictHostKeyChecking=accept-new \ - -o ServerAliveInterval=30 \ - tdx2@"$HOST" \ - "sudo stdbuf -oL tail -n +1 -F /var/log/ee-local-$KIND.log || true" \ - 2>&1 | stdbuf -oL sed -u "s/^/ [$vm console] /" \ - | stdbuf -oL tee -a "$mirror" & - tail_pid=$! 
- # Give the tail a moment to attach so its first lines land - # before the register wait's first status message. - sleep 1 - fi - cleanup_tail() { - # Kill every child process — the `ssh | sed | tee` pipeline - # plus any stragglers. `kill "$tail_pid"` alone drops tee but - # leaves ssh holding its remote `tail -F`, which can prevent - # the Actions runner from closing the step out (stdout stream - # still has live writers). `pkill -P $$` nukes the whole - # tree; a short SIGKILL chaser covers processes that ignored - # SIGTERM. - pkill -TERM -P $$ 2>/dev/null || true - sleep 0.3 - pkill -KILL -P $$ 2>/dev/null || true - } - - # Mint a GH OIDC token for the /api/agents bearer auth. The - # calling job must grant `id-token: write` in its - # permissions block. - if [ -z "${ACTIONS_ID_TOKEN_REQUEST_TOKEN:-}" ]; then - cleanup_tail - echo "::error::id-token not available — caller needs 'permissions: id-token: write'" - exit 1 - fi - oidc=$(curl -fsSL \ - -H "Authorization: Bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \ - "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=dd-agent" | jq -r .value) - last_payload="" - for i in $(seq 1 60); do - payload=$(curl -fsS --max-time 10 -H "Authorization: Bearer ${oidc}" \ - "$URL/api/agents" 2>/dev/null || true) - if [ -n "$payload" ]; then - last_payload="$payload" - fi - host_found=$(echo "$payload" | jq -r --arg since "$since" --arg vm "$vm" ' - [.[] | select(.vm_name==$vm and .status=="healthy" and .last_seen > $since)] - | sort_by(.last_seen) | reverse | .[0].hostname // empty' 2>/dev/null || true) - if [ -n "$host_found" ]; then - echo "$vm registered at https://$host_found" - cleanup_tail - exit 0 - fi - # Fail fast if the agent already exited with a fatal error. - # The console mirror is the streaming tee of the serial log; - # `devopsdefender: fatal:` is main.rs's terminal eprintln - # and EE doesn't restart boot workloads, so once it lands in - # the mirror there's nothing left to wait for. 
- if [ -s "$mirror" ] && grep -q 'devopsdefender: fatal:' "$mirror"; then - cleanup_tail - echo "::group::Fatal line from $vm console" - grep -m1 'devopsdefender: fatal:' "$mirror" || true - echo "::endgroup::" - echo "::error::$vm agent exited fatal; not polling /api/agents further" - exit 1 - fi - echo " waiting for $vm to register with $URL... (${i}/60)" - sleep 10 - done - cleanup_tail - echo "::group::Last /api/agents payload" - echo "$last_payload" | jq . 2>/dev/null || echo "$last_payload" - echo "::endgroup::" - echo "::error::$vm never registered with $URL within 10 min (since=$since)" - exit 1 diff --git a/.github/actions/relaunch-cp/action.yml b/.github/actions/relaunch-cp/action.yml deleted file mode 100644 index 5b8683c..0000000 --- a/.github/actions/relaunch-cp/action.yml +++ /dev/null @@ -1,79 +0,0 @@ -name: Relaunch local TDX CP -description: >- - SSH into the tdx2 host and recreate the `dd-local-{env}-cp` libvirt - domain for this env (prod / pr-N / staging). Mirrors relaunch-agent, - but boots dd-management instead of dd-agent. Used by deploy-cp.yml's - `target: ssh` branch as a drop-in replacement for GCE VM creation. - -inputs: - env: - description: 'DD_ENV the CP serves (e.g. pr-42, staging)' - required: true - hostname: - description: 'Where the CP claims its CF tunnel (e.g. 
pr-42.devopsdefender.com)' - required: true - ref: - description: 'Git ref whose apps/ tree the host should check out' - required: true - ssh-key: - description: 'Private SSH key for tdx2@host' - required: true - host: - description: 'Public host address of the tdx2 node' - required: true - cf-api-token: - description: 'Cloudflare API token (tunnel + DNS + Access scopes)' - required: true - cf-account-id: - description: 'Cloudflare account id' - required: true - cf-zone-id: - description: 'Cloudflare zone id for the apex domain' - required: true - admin-email: - description: 'Break-glass email for CF Access human policy' - required: true - ita-api-key: - description: 'Intel Trust Authority API key' - required: true - release-tag: - description: 'devopsdefender release tag (e.g. pr-abc123, latest)' - required: true - ee-channel: - description: 'easyenclave release channel to sync the libvirt base qcow2 from (stable | staging)' - required: false - default: 'staging' - ee-tag: - description: 'Explicit easyenclave release tag override for pre-flight testing a candidate (e.g. image-abc123456). Wins over `ee-channel`.' - required: false - default: '' - owner: - description: 'Fleet principal — GitHub login or owner/repo path. Forwarded as EE_OWNER to local-cp.sh on the tdx2 host, which resolves it to id+kind via gh api and bakes all three into agent.env.' 
- required: true - -runs: - using: composite - steps: - - name: ssh + relaunch CP VM - uses: appleboy/ssh-action@v1.2.0 - env: - ENV_LABEL: ${{ inputs.env }} - CP_HOSTNAME: ${{ inputs.hostname }} - REF: ${{ inputs.ref }} - RELEASE_TAG: ${{ inputs.release-tag }} - CLOUDFLARE_API_TOKEN: ${{ inputs.cf-api-token }} - CLOUDFLARE_ACCOUNT_ID: ${{ inputs.cf-account-id }} - CLOUDFLARE_ZONE_ID: ${{ inputs.cf-zone-id }} - DD_ACCESS_ADMIN_EMAIL: ${{ inputs.admin-email }} - DD_ITA_API_KEY: ${{ inputs.ita-api-key }} - DD_EE_CHANNEL: ${{ inputs.ee-channel }} - DD_EE_TAG: ${{ inputs.ee-tag }} - EE_OWNER: ${{ inputs.owner }} - with: - host: ${{ inputs.host }} - username: tdx2 - key: ${{ inputs.ssh-key }} - envs: ENV_LABEL,CP_HOSTNAME,REF,RELEASE_TAG,CLOUDFLARE_API_TOKEN,CLOUDFLARE_ACCOUNT_ID,CLOUDFLARE_ZONE_ID,DD_ACCESS_ADMIN_EMAIL,DD_ITA_API_KEY,DD_EE_CHANNEL,DD_EE_TAG,EE_OWNER - script: | - /home/tdx2/src/dd/apps/_infra/dd-relaunch-cp.sh \ - "$ENV_LABEL" "$CP_HOSTNAME" "$REF" "$RELEASE_TAG" diff --git a/.github/actions/verify/action.yml b/.github/actions/verify/action.yml new file mode 100644 index 0000000..c77b4c5 --- /dev/null +++ b/.github/actions/verify/action.yml @@ -0,0 +1,69 @@ +name: Verify DD v2 agent proof +description: Fetch and check a DD v2 agent proof document. + +inputs: + agent-url: + description: 'Agent base URL, e.g. https://agent.example.com' + required: true + owner-kind: + description: 'Expected owner kind. Empty skips owner check.' + required: false + default: '' + owner-name: + description: 'Expected owner name. Empty skips owner check.' + required: false + default: '' + owner-id: + description: 'Expected numeric owner id. Empty skips owner check.' 
+ required: false + default: '' + require-no-exec: + description: 'Fail if capabilities.exec is true' + required: false + default: 'false' + require-no-shell: + description: 'Fail if capabilities.interactive_shell is true' + required: false + default: 'false' + +outputs: + agent-id: + description: 'Agent id from proof' + value: ${{ steps.verify.outputs.agent-id }} + +runs: + using: composite + steps: + - name: Verify proof + id: verify + shell: bash + env: + AGENT_URL: ${{ inputs.agent-url }} + OWNER_KIND: ${{ inputs.owner-kind }} + OWNER_NAME: ${{ inputs.owner-name }} + OWNER_ID: ${{ inputs.owner-id }} + REQUIRE_NO_EXEC: ${{ inputs.require-no-exec }} + REQUIRE_NO_SHELL: ${{ inputs.require-no-shell }} + run: | + set -euo pipefail + proof=$(curl -fsSL "${AGENT_URL%/}/health") + echo "$proof" | jq . + echo "agent-id=$(echo "$proof" | jq -r .agent_id)" >> "$GITHUB_OUTPUT" + + if [ -n "$OWNER_KIND" ] || [ -n "$OWNER_NAME" ] || [ -n "$OWNER_ID" ]; then + if [ -z "$OWNER_KIND" ] || [ -z "$OWNER_NAME" ] || [ -z "$OWNER_ID" ]; then + echo "::error::owner-kind, owner-name, and owner-id must be set together" + exit 1 + fi + echo "$proof" | jq -e \ + --arg kind "$OWNER_KIND" \ + --arg name "$OWNER_NAME" \ + --argjson id "$OWNER_ID" \ + '.owner.kind == $kind and .owner.name == $name and .owner.id == $id' >/dev/null + fi + if [ "$REQUIRE_NO_EXEC" = "true" ]; then + echo "$proof" | jq -e '.capabilities.exec == false' >/dev/null + fi + if [ "$REQUIRE_NO_SHELL" = "true" ]; then + echo "$proof" | jq -e '.capabilities.interactive_shell == false' >/dev/null + fi diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..2fb73bc --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,25 @@ +name: CI + +on: + pull_request: + push: + branches: [main] + +permissions: + contents: read + +jobs: + rust: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt + - uses: 
Swatinem/rust-cache@v2 + - name: cargo fmt + run: cargo fmt --all -- --check + - name: cargo check + run: cargo check --workspace + - name: cargo test + run: cargo test --workspace diff --git a/.github/workflows/cleanup.yml b/.github/workflows/cleanup.yml deleted file mode 100644 index 69312e5..0000000 --- a/.github/workflows/cleanup.yml +++ /dev/null @@ -1,353 +0,0 @@ -name: Cleanup - -# Unified teardown for per-PR preview envs, plus orphan CF state. -# -# One `teardown_env` routine, used by both trigger classes: -# -# - `on: delete` (branch-delete) → tear down exactly that PR's env. -# Replaces the old dedicated `pr-teardown.yml`; fast path. -# -# - `schedule` + `workflow_dispatch` + `workflow_run: Release` → -# sweep every closed/merged PR env that still has lingering CF -# state or VMs, then run an orphan-agent hostname pass. Slow -# path; safety net for anything the fast path missed (branch -# kept open post-close, CF/libvirt/GCE error during teardown, -# agent-tunnel UUIDs the per-env prefix doesn't catch). -# -# Per-env teardown covers, in order: -# 1. `dd-local-{env}-cp` libvirt domain + qcow2 + config iso -# + serial log on tdx2 (via SSH — target=ssh CPs). -# 2. CF Access apps named `dd-{env}-…`. -# 3. CF tunnels named `dd-{env}-…` (connections drained first — -# delete is 400 otherwise). -# 4. CF DNS CNAMEs matching `{env}.`, `{env}-`, or `dd-{env}-`. -# 5. GCE instances labeled `dd_env={env}` (legacy target=gcp; -# ssh-target deploys don't produce these). -# -# A separate `reap-terminated-gce` job still exists to sweep -# TERMINATED GCE VMs for *open* envs (production; historic staging -# TERMINATED state). That's orthogonal to per-env teardown and -# keyed on status rather than PR lifecycle. 
- -on: - delete: - workflow_dispatch: - workflow_run: - workflows: ["Release"] - types: [completed] - schedule: - - cron: '0 */6 * * *' - -concurrency: - group: dd-cleanup - cancel-in-progress: false - -permissions: - contents: read - -env: - GCP_ZONE: us-central1-c - -jobs: - teardown-pr-envs: - # Ignore tag-deletes; they never have an associated env. - if: github.event_name != 'delete' || github.event.ref_type == 'branch' - runs-on: ubuntu-latest - environment: staging - permissions: - contents: read - id-token: write - pull-requests: read - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - CF_API_TOKEN: ${{ secrets.DD_CP_CF_API_TOKEN }} - CF_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }} - CF_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }} - DD_DOMAIN: ${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - GH_TOKEN: ${{ github.token }} - DD_LOCAL_HOST: ${{ secrets.DD_LOCAL_HOST }} - DD_LOCAL_SSH_KEY: ${{ secrets.DD_LOCAL_SSH_KEY }} - EVENT_NAME: ${{ github.event_name }} - DELETED_REF: ${{ github.event.ref }} - REPO: ${{ github.repository }} - steps: - - uses: google-github-actions/auth@v2 - with: - workload_identity_provider: 'projects/654815109728/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - service_account: 'easyenclave-staging-ci@eestaging.iam.gserviceaccount.com' - - uses: google-github-actions/setup-gcloud@v2 - - - name: Cleanup - run: | - set -euo pipefail - AUTH=(-H "Authorization: Bearer $CF_API_TOKEN") - - # SSH key for libvirt teardown on tdx2. - ssh_key=$(mktemp) - trap 'rm -f "$ssh_key"' EXIT - printf '%s\n' "$DD_LOCAL_SSH_KEY" > "$ssh_key" - chmod 600 "$ssh_key" - - # Fetch CF state once; reused across every env in the - # loop below and by the orphan-agent sweep at the end. - # per_page=200 for tunnels so a predecessor doesn't sort - # off page 1 on a busy account (same cap as src/cf.rs). - echo "Fetching CF state..." 
- apps=$(curl -fsS "${AUTH[@]}" \ - "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/access/apps?per_page=1000") - tunnels=$(curl -fsS "${AUTH[@]}" \ - "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel?is_deleted=false&per_page=200") - dns=$(curl -fsS "${AUTH[@]}" \ - "https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/dns_records?per_page=500&type=CNAME") - - # Per-env teardown. Best-effort per resource so a transient - # failure on one doesn't abort the rest of the sweep. - teardown_env() { - local env="$1" - echo "== $env ==" - - # 1. libvirt CP domain on tdx2. `|| true` around the - # whole ssh so a transient tdx2-unreachable doesn't - # kill the CF work that follows; next run catches - # anything missed. - ssh -i "$ssh_key" -T \ - -o StrictHostKeyChecking=accept-new \ - -o ServerAliveInterval=30 \ - -o ConnectTimeout=15 \ - tdx2@"$DD_LOCAL_HOST" bash -s -- "$env" <<'EOSSH' || true - set -euo pipefail - env="$1" - vm="dd-local-$env-cp" - if virsh dominfo "$vm" >/dev/null 2>&1; then - virsh destroy "$vm" 2>/dev/null || true - virsh undefine "$vm" --managed-save --snapshots-metadata 2>/dev/null || true - echo " - libvirt $vm" - fi - sudo rm -f \ - "/var/lib/libvirt/images/$vm.qcow2" \ - "/var/lib/libvirt/images/$vm-config.iso" \ - "/var/log/ee-local-$env-cp.log" \ - 2>/dev/null || true - EOSSH - - # 2. CF Access apps named dd-{env}-… - echo "$apps" | jq -r --arg p "dd-$env-" \ - '.result[] | select(.name | startswith($p)) | "\(.id) \(.name)"' \ - | while read -r id name; do - [ -z "$id" ] && continue - curl -fsS -X DELETE "${AUTH[@]}" \ - "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/access/apps/$id" \ - >/dev/null 2>&1 && echo " - app $name" || echo " ! app $name FAILED" - done - - # 3. CF tunnels named dd-{env}-… (drain connections - # first — tunnel delete 400s otherwise). 
- echo "$tunnels" | jq -r --arg p "dd-$env-" \ - '.result[] | select(.name | startswith($p)) | "\(.id) \(.name)"' \ - | while read -r id name; do - [ -z "$id" ] && continue - curl -fsS -X DELETE "${AUTH[@]}" \ - "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$id/connections" \ - >/dev/null 2>&1 || true - curl -fsS -X DELETE "${AUTH[@]}" \ - "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/cfd_tunnel/$id" \ - >/dev/null 2>&1 && echo " - tun $name" || echo " ! tun $name FAILED" - done - - # 4. DNS CNAMEs: - # {env}. CP hostname (pr-42.devopsdefender.com) - # {env}- CP workload label (pr-42-gpu.*) - # dd-{env}- agent tunnels (dd-pr-42-agent-*.) - echo "$dns" | jq -r --arg dot "$env." --arg dash "$env-" --arg dd "dd-$env-" \ - '.result[] | select((.name | startswith($dot)) or (.name | startswith($dash)) or (.name | startswith($dd))) | "\(.id) \(.name)"' \ - | while read -r id name; do - [ -z "$id" ] && continue - curl -fsS -X DELETE "${AUTH[@]}" \ - "https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/dns_records/$id" \ - >/dev/null 2>&1 && echo " - dns $name" || echo " ! dns $name FAILED" - done - - # 5. GCE instances (legacy target=gcp path). - local env_vms - env_vms=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=$env" \ - --format='value(name)') - if [ -n "$env_vms" ]; then - # shellcheck disable=SC2086 - gcloud compute instances delete $env_vms \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true - echo " - gce $(echo "$env_vms" | tr '\n' ' ')" - fi - } - - # Pick which envs to tear down based on trigger. - envs="" - if [ "$EVENT_NAME" = "delete" ]; then - # Fast path: resolve the deleted branch to its PR number. - # `--state all` so the usual post-merge branch-delete - # flow resolves. Newest wins if the branch was ever - # reused across multiple PRs. 
- PR=$(gh pr list --repo "$REPO" \ - --head "$DELETED_REF" --state all --limit 1 \ - --json number --jq '.[0].number // empty') - if [ -n "$PR" ]; then - echo "Branch-delete $DELETED_REF → PR #$PR → env pr-$PR" - envs="pr-$PR" - else - echo "No PR ever existed for branch $DELETED_REF; nothing to tear down." - fi - else - # Sweep path: every pr-N named anywhere in CF state or - # among RUNNING GCE VMs whose PR is closed/merged. - vm_envs=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter='labels.devopsdefender=managed AND labels.dd_env~"^pr-" AND status=RUNNING' \ - --format='value(labels.dd_env)' | sort -u) - # `grep -oE` returns 1 on no match, which would trip - # pipefail the moment the account is clean (happy state); - # swallow that specific case. - all_envs=$( { - echo "$apps" | jq -r '.result[].name // empty' - echo "$tunnels" | jq -r '.result[].name // empty' - echo "$dns" | jq -r '.result[].name // empty' - echo "$vm_envs" - } | grep -oE 'pr-[0-9]+' | sort -u || true ) - for env in $all_envs; do - pr="${env#pr-}" - state=$(gh pr view "$pr" --repo "$REPO" \ - --json state --jq .state 2>/dev/null || echo UNKNOWN) - case "$state" in - OPEN) echo "== $env — PR #$pr OPEN, skipping ==" ;; - UNKNOWN) echo "::warning::pr-$pr state unknown; leaving alone" ;; - *) envs="$envs $env" ;; - esac - done - fi - - if [ -z "$envs" ]; then - echo "No envs to tear down." - else - for env in $envs; do - teardown_env "$env" - done - fi - - # Orphan agent-hostname sweep — scheduled runs only. Agent - # tunnels are `dd-{env}-agent-{uuid}`; every agent relaunch - # mints a fresh UUID, STONITH kills the old tunnel, but the - # old tunnel's Access apps and DNS records linger. The - # per-env pass above matches by `dd-{env}-` prefix so it - # catches these for CLOSED envs; this catch-up pass covers - # them for OPEN envs (where the env-level sweep intentionally - # skips). 
Branch-delete doesn't need this — that path's - # env-level sweep already cleared the env's orphans. - if [ "$EVENT_NAME" != "delete" ]; then - echo "" - echo "Orphan agent-hostname sweep..." - - live_tunnel_names=$(echo "$tunnels" | jq -r '.result[].name' \ - | grep -E '^dd-[a-zA-Z0-9-]+-agent-' | sort -u) - live_hostnames=$(for t in $live_tunnel_names; do - echo "$t.$DD_DOMAIN" - done | sort -u) - - orphan_apps=$(echo "$apps" | jq -r \ - '.result[] | select(.name | test("^dd-[a-zA-Z0-9-]+-agent-dd-[a-zA-Z0-9-]+-agent-")) | "\(.id) \(.name)"' \ - | while read -r id name; do - host=$(echo "$name" | grep -oE 'dd-[a-zA-Z0-9-]+-agent-[a-f0-9-]+\.devopsdefender\.com' | head -1) - [ -z "$host" ] && continue - if ! echo "$live_hostnames" | grep -qx "$host"; then - echo "$id $name" - fi - done) - - orphan_dns=$(echo "$dns" | jq -r \ - '.result[] | select(.name | test("dd-[a-zA-Z0-9-]+-agent-[a-f0-9-]+\\.devopsdefender\\.com")) | "\(.id) \(.name)"' \ - | while read -r id name; do - host=$(echo "$name" | grep -oE 'dd-[a-zA-Z0-9-]+-agent-[a-f0-9-]+\.devopsdefender\.com' | head -1) - [ -z "$host" ] && continue - if ! echo "$live_hostnames" | grep -qx "$host"; then - echo "$id $name" - fi - done) - - oa=$(printf '%s\n' "$orphan_apps" | grep -c . || true) - od=$(printf '%s\n' "$orphan_dns" | grep -c . || true) - echo "Orphan-agent: apps=$oa dns=$od" - - if [ -n "$orphan_apps" ]; then - echo "$orphan_apps" | while read -r id name; do - [ -z "$id" ] && continue - curl -fsS -X DELETE "${AUTH[@]}" \ - "https://api.cloudflare.com/client/v4/accounts/$CF_ACCOUNT_ID/access/apps/$id" \ - >/dev/null 2>&1 && echo " - app $name" || echo " ! app $name FAILED" - done - fi - if [ -n "$orphan_dns" ]; then - echo "$orphan_dns" | while read -r id name; do - [ -z "$id" ] && continue - curl -fsS -X DELETE "${AUTH[@]}" \ - "https://api.cloudflare.com/client/v4/zones/$CF_ZONE_ID/dns_records/$id" \ - >/dev/null 2>&1 && echo " - dns $name" || echo " ! 
dns $name FAILED" - done - fi - fi - - # Explicit success — prior `[ -n "$x" ] && cmd` short-circuit - # would leave `$?=1` when both orphan lists were empty (the - # happy state), and `set -e` turned that into a script failure - # with no visible error. Trailing no-op restores exit 0. - echo "cleanup done." - - # TERMINATED GCE VMs accumulate for *open* envs (STONITH moves a - # prior VM to TERMINATED; deploy-cp.yml's verify step force-deletes - # zombies, but only on target=gcp). Keyed on status, not PR state, - # so it's distinct from `teardown-pr-envs` above. Skipped on - # branch-delete events — nothing env-scoped to the deleted branch - # lives here on target=ssh. - reap-terminated-gce: - if: github.event_name != 'delete' - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - include: - - env: staging - wip: 'projects/654815109728/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - sa: 'easyenclave-staging-ci@eestaging.iam.gserviceaccount.com' - filter: 'labels.devopsdefender=managed AND labels.dd_env~"^pr-" AND status=TERMINATED' - label: 'dd-pr-*' - - env: production - wip: 'projects/779946350556/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - sa: 'easyenclave-production-ci@easyenclave.iam.gserviceaccount.com' - filter: 'labels.devopsdefender=managed AND labels.dd_env=production AND status=TERMINATED' - label: 'dd-production' - environment: ${{ matrix.env }} - permissions: - contents: read - id-token: write - steps: - - uses: google-github-actions/auth@v2 - with: - workload_identity_provider: ${{ matrix.wip }} - service_account: ${{ matrix.sa }} - - uses: google-github-actions/setup-gcloud@v2 - - name: Reap TERMINATED ${{ matrix.label }} VMs - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - FILTER: ${{ matrix.filter }} - LABEL: ${{ matrix.label }} - run: | - DEAD=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="$FILTER" \ - 
--format="value(name)") - if [ -z "$DEAD" ]; then - echo "No TERMINATED $LABEL VMs to reap." - exit 0 - fi - echo "Reaping: $(echo "$DEAD" | tr '\n' ' ')" - # shellcheck disable=SC2086 - gcloud compute instances delete $DEAD \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet diff --git a/.github/workflows/deploy-cp.yml b/.github/workflows/deploy-cp.yml deleted file mode 100644 index f292373..0000000 --- a/.github/workflows/deploy-cp.yml +++ /dev/null @@ -1,613 +0,0 @@ -name: Deploy CP - -# Reusable workflow: provision the CP TDX VM on GCP, wait for it to be -# healthy, verify attestation + dashboard + STONITH, then cascade a -# relaunch of the matching dd-local agent VM and block until it -# re-registers. Called from release.yml's deploy-preview (PR path) and -# deploy-production (main / dispatch path) with env-specific inputs — -# both paths share this exact set of verification steps so every PR -# exercises the prod deploy code. -# -# GitHub Actions allows ≤4 levels of workflow_call nesting. Today's -# chain is `release.yml → deploy-cp.yml` (2). The agent-relaunch -# cascade uses a composite action (same-job, no nesting) to keep -# headroom for future wrapping. - -on: - workflow_call: - inputs: - env: - description: 'DD_ENV (e.g. "production", "pr-42")' - required: true - type: string - hostname: - description: 'Public hostname (e.g. app.devopsdefender.com)' - required: true - type: string - gcp_environment: - description: 'GitHub environment name — "production" | "staging"' - required: true - type: string - workload_identity_provider: - description: 'GCP Workload Identity Federation provider resource name (required only for target=gcp)' - required: false - type: string - default: '' - service_account: - description: 'GCP service account email (required only for target=gcp)' - required: false - type: string - default: '' - release_tag: - description: 'devopsdefender release tag to deploy (e.g. 
"latest", "pr-abc123")' - required: true - type: string - comment_on_pr: - description: 'Leave a PR comment with the preview URL' - required: false - type: boolean - default: false - relaunch_agent: - description: 'After CP deploy, cascade a relaunch of dd-local-{env} via SSH' - required: false - type: boolean - default: true - ref: - description: 'Git ref the tdx2 host should pull before relaunching the agent VM' - required: false - type: string - default: main - target: - description: 'Where the CP runs: "ssh" (libvirt VM on the tdx2 baremetal, default) or "gcp" (fresh TDX VM on GCE)' - required: false - type: string - default: ssh - ee_tag: - description: 'Explicit easyenclave release tag to pin this deploy to (pre-flight-test a candidate before promoting staging→stable). Empty = default to channel resolved from env.' - required: false - type: string - default: '' - owner: - description: 'Fleet principal — GitHub login or owner/repo path. Baked into the agent.env as EE_OWNER and resolved to id+kind via gh api. No default; the caller (release.yml) sets it explicitly.' - required: true - type: string - -# Serialize PR-preview deploys on the `preview` bucket: all pr-* -# runs share one libvirt domain (`dd-local-preview`) on tdx2, so -# two concurrent deploys race — the second PR's dd-relaunch.sh -# destroys and rebuilds the VM mid-way through the first PR's -# register-wait, so the first PR's agent ends up registering -# against the WRONG cp (whichever env won the rebuild) and the -# wait times out. Keeping production in its own group so a -# prod deploy isn't blocked by a preview queue. 
-concurrency: - group: deploy-cp-${{ inputs.env == 'production' && 'production' || 'preview' }} - cancel-in-progress: false - -jobs: - deploy: - runs-on: ubuntu-latest - environment: ${{ inputs.gcp_environment }} - permissions: - contents: read - id-token: write - pull-requests: write - env: - DD_ENV: ${{ inputs.env }} - DD_HOSTNAME: ${{ inputs.hostname }} - GCP_ZONE: us-central1-c - steps: - - uses: actions/checkout@v4 - - # Map dd env → easyenclave channel. Prod (CP and local) tracks - # `stable` (EE v* tags); everything else (pr-N previews) tracks - # `staging` (EE prereleases from main). Keeps dd prod off - # in-flight-main EE builds while preview deploys catch EE - # regressions early. Applies to both target=gcp (image family - # lookup) and target=ssh (release-asset sync on tdx2). - - name: Resolve easyenclave channel - id: ee - env: - ENV: ${{ inputs.env }} - run: | - case "$ENV" in - production) channel=stable; family=easyenclave-stable ;; - *) channel=staging; family=easyenclave-staging ;; - esac - { - echo "channel=$channel" - echo "family=$family" - } >> "$GITHUB_OUTPUT" - - # GCP auth only on target=gcp. We used to configure it - # unconditionally so ssh-target deploys could reap GCE orphans - # from a prior gcp-target deploy, but the resulting ~20s - # overhead per deploy outweighed the benefit — the orphan path - # is only reachable during a one-time target=gcp→ssh - # transition, and `.github/workflows/force-cleanup-tunnels.yml` - # (and GCP's own console) are better homes for that cleanup. 
- - uses: google-github-actions/auth@v2 - if: inputs.target == 'gcp' - with: - workload_identity_provider: ${{ inputs.workload_identity_provider }} - service_account: ${{ inputs.service_account }} - - uses: google-github-actions/setup-gcloud@v2 - if: inputs.target == 'gcp' - - - name: Relaunch SSH CP VM (target=ssh) - if: inputs.target == 'ssh' - uses: ./.github/actions/relaunch-cp - with: - env: ${{ inputs.env }} - hostname: ${{ inputs.hostname }} - ref: ${{ inputs.ref }} - ssh-key: ${{ secrets.DD_LOCAL_SSH_KEY }} - host: ${{ secrets.DD_LOCAL_HOST }} - cf-api-token: ${{ secrets.DD_CP_CF_API_TOKEN }} - cf-account-id: ${{ secrets.DD_CP_CF_ACCOUNT_ID }} - cf-zone-id: ${{ secrets.DD_CP_CF_ZONE_ID }} - admin-email: ${{ vars.DD_ACCESS_ADMIN_EMAIL || secrets.DD_ACCESS_ADMIN_EMAIL }} - ita-api-key: ${{ secrets.DD_ITA_API_KEY }} - release-tag: ${{ inputs.release_tag }} - ee-channel: ${{ steps.ee.outputs.channel }} - ee-tag: ${{ inputs.ee_tag }} - owner: ${{ inputs.owner }} - - - name: Create TDX VM (boots from easyenclave, fetches dd from GitHub releases) - if: inputs.target == 'gcp' - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - DD_DOMAIN: ${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - CLOUDFLARE_API_TOKEN: ${{ secrets.DD_CP_CF_API_TOKEN }} - CLOUDFLARE_ACCOUNT_ID: ${{ secrets.DD_CP_CF_ACCOUNT_ID }} - CLOUDFLARE_ZONE_ID: ${{ secrets.DD_CP_CF_ZONE_ID }} - DD_ACCESS_ADMIN_EMAIL: ${{ vars.DD_ACCESS_ADMIN_EMAIL || secrets.DD_ACCESS_ADMIN_EMAIL }} - DD_ITA_API_KEY: ${{ secrets.DD_ITA_API_KEY }} - DD_RELEASE_TAG: ${{ inputs.release_tag }} - EE_OWNER: ${{ inputs.owner }} - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - EE_IMAGE_FAMILY: ${{ steps.ee.outputs.family }} - EE_IMAGE_PROJECT: easyenclave - VM_MACHINE_TYPE: c3-standard-4 - VM_DISK_SIZE: 10GB - DD_ITA_BASE_URL: https://api.trustauthority.intel.com - DD_ITA_JWKS_URL: https://portal.trustauthority.intel.com/certs - DD_ITA_ISSUER: https://portal.trustauthority.intel.com - run: | - set -euo pipefail - - 
VM_NAME="dd-${DD_ENV}-$(date +%s)" - : "${DD_ITA_API_KEY:?set DD_ITA_API_KEY via secrets.DD_ITA_API_KEY}" - : "${DD_ACCESS_ADMIN_EMAIL:?set DD_ACCESS_ADMIN_EMAIL via vars or secrets.DD_ACCESS_ADMIN_EMAIL}" - : "${EE_OWNER:?set inputs.owner from the calling workflow}" - - # Resolve EE_OWNER → numeric id + kind via gh api. Same idiom - # as apps/_infra/local-{agents,cp}.sh. Hard-fails on lookup - # miss so a typo doesn't bake a bricked principal into the VM. - if [[ "$EE_OWNER" == */* ]]; then - EE_OWNER_ID=$(gh api "repos/$EE_OWNER" -q .id) - EE_OWNER_KIND=repo - else - read -r EE_OWNER_ID GH_TYPE < <(gh api "users/$EE_OWNER" -q '"\(.id) \(.type)"') - case "$GH_TYPE" in - User) EE_OWNER_KIND=user ;; - Organization) EE_OWNER_KIND=org ;; - *) echo "::error::unexpected gh api type: $GH_TYPE"; exit 1 ;; - esac - fi - export DD_OWNER="$EE_OWNER" DD_OWNER_ID DD_OWNER_KIND - DD_OWNER_ID="$EE_OWNER_ID" - DD_OWNER_KIND="$EE_OWNER_KIND" - echo " EE_OWNER=$EE_OWNER (kind=$EE_OWNER_KIND, id=$EE_OWNER_ID)" - - # Bake a workload template: substitute ${VAR} placeholders - # and strip "KEY=" env entries that ended up with empty values - # (e.g. OAuth creds in non-prod envs). envsubst is restricted - # to the uppercase ${VAR} refs the template actually declares - # so shell locals inside cmd strings ($i, $((…)), etc.) - # aren't eaten. - bake() { - case "$1" in - *.json.tmpl) - local vars - vars=$(grep -oE '\$\{[A-Z_][A-Z0-9_]*\}' "$1" | sort -u | tr -d '\n') - envsubst "$vars" < "$1" \ - | jq -c 'if .env then .env |= map(select(test("^[^=]+=.+"))) else . end' - ;; - *.json) - jq -c . "$1" - ;; - *) - echo "::error::unknown workload file type: $1" >&2 - return 1 - ;; - esac - } - - # Boot workloads come from apps//workload.{json,json.tmpl}. - # cloudflared fetches the binary onto PATH; dd-management runs - # devopsdefender in DD_MODE=management (CP + dashboard). 
- EE_BOOT_WORKLOADS=$({ - bake apps/cloudflared/workload.json - bake apps/dd-management/workload.json.tmpl - bake apps/ttyd/workload.json - bake apps/ee-proxy/workload.json.tmpl - } | jq -cs '.') - - # EE_CAPTURE_SOCKET tells EE (post-capture-socket patch) to tee - # every spawned workload's stdio to this unix socket. Kept for - # forward compatibility — a future workload can bind + read it. - # Unpatched EE images ignore the variable; patched EE falls back - # to running without capture when nothing is listening. - jq -c -n \ - --arg workloads "$EE_BOOT_WORKLOADS" \ - --arg owner "$EE_OWNER" \ - --argjson owner_id "$EE_OWNER_ID" \ - --arg owner_kind "$EE_OWNER_KIND" \ - '{ - "EE_BOOT_WORKLOADS": $workloads, - "EE_OWNER": $owner, - "EE_OWNER_ID": ($owner_id | tostring), - "EE_OWNER_KIND": $owner_kind, - "EE_CAPTURE_SOCKET": "/run/ee/capture.sock" - }' \ - > /tmp/ee-config.json - - gcloud compute instances create "$VM_NAME" \ - --project="$GCP_PROJECT_ID" \ - --zone="$GCP_ZONE" \ - --machine-type="$VM_MACHINE_TYPE" \ - --confidential-compute-type=TDX \ - --maintenance-policy=TERMINATE \ - --boot-disk-size="$VM_DISK_SIZE" \ - --image-family="$EE_IMAGE_FAMILY" \ - --image-project="$EE_IMAGE_PROJECT" \ - --metadata-from-file=ee-config=/tmp/ee-config.json \ - --labels=devopsdefender=managed,dd_env="${DD_ENV}" \ - --tags=dd-management - - rm -f /tmp/ee-config.json - echo "VM: $VM_NAME ($DD_HOSTNAME, release $DD_RELEASE_TAG)" - - - name: Wait for CP health (streams GCE serial console) - if: inputs.target == 'gcp' - env: - AGENT_URL: https://${{ inputs.hostname }} - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - run: | - VM_NAME=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ - --format="value(name)" --sort-by=~creationTimestamp | head -1) - if [ -z "$VM_NAME" ]; then - echo "::error::no dd-${DD_ENV} VM found — gcp-deploy.sh must have failed" - exit 1 - fi - echo "Watching VM: 
$VM_NAME (zone: $GCP_ZONE)" - - LAST_LINES=0 - for i in $(seq 1 60); do - # Stream serial console so boot failures (DHCP hang, release - # fetch error, cloudflared exit, etc.) are visible without - # shelling into GCP. - gcloud compute instances get-serial-port-output "$VM_NAME" \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" 2>/dev/null \ - > /tmp/serial.log || true - TOTAL_LINES=$(wc -l < /tmp/serial.log) - if [ "$TOTAL_LINES" -gt "$LAST_LINES" ]; then - tail -n +$((LAST_LINES + 1)) /tmp/serial.log \ - | sed 's/^/[serial] /' - LAST_LINES=$TOTAL_LINES - fi - - if grep -qE "FATAL|Kernel panic|Invalid ELF header|/bin/sh: can't access tty" /tmp/serial.log; then - echo "::error::boot failed — serial log shows fatal pattern" - exit 1 - fi - - if curl -fsS "${AGENT_URL}/health" >/dev/null 2>&1; then - echo "Agent healthy at ${AGENT_URL}" - exit 0 - fi - echo " waiting for tunnel... (${i}/60)" - sleep 5 - done - echo "::error::Agent not healthy within 5 minutes" - echo "--- final serial tail ---" - tail -80 /tmp/serial.log | sed 's/^/[serial] /' - exit 1 - - - name: Wait for CP health (SSH CP — streams serial console) - if: inputs.target == 'ssh' - env: - AGENT_URL: https://${{ inputs.hostname }} - ENV_LABEL: ${{ inputs.env }} - HOST: ${{ secrets.DD_LOCAL_HOST }} - SSH_KEY: ${{ secrets.DD_LOCAL_SSH_KEY }} - run: | - set -uo pipefail - # SSH CP boot fires asynchronously via `virsh start` on the - # tdx2 host. We own the outside view (poll /health for a - # real 200) and tail the inside view — the serial console - # at /var/log/ee-local-${env}-cp.log — so every boot-time - # eprintln! lands in this job's log in real time. Both views - # together turn "CP not healthy within 5 min" from an opaque - # timeout into a traceable failure. - tail_pid="" - key=$(mktemp) - mirror=$(mktemp) - # `rc=$?` FIRST: capture the script's actual exit status so - # the `exit $rc` at the end doesn't inherit `pkill`'s exit 1 - # (which fires whenever it has no matches — i.e. 
a clean - # state). Without that, every successful run would be - # reported as failed. - # - # `pkill -P $$` kills the ENTIRE tail pipeline (ssh, sed, - # tee) — not just `$tail_pid`, which is tee (the last - # command). Without this the ssh hangs on the remote - # `tail -F`, stdout stays open on the runner, and the step - # can stall long past the script's `exit` call. - trap 'rc=$?; rm -f "$key" "$mirror"; pkill -TERM -P $$ 2>/dev/null || true; sleep 0.3; pkill -KILL -P $$ 2>/dev/null || true; exit $rc' EXIT - printf '%s\n' "$SSH_KEY" > "$key" - chmod 600 "$key" - # `stdbuf -oL` on both ends: force tail and sed to line-buffer - # so each VM boot line lands in this job's log as it's written, - # instead of getting block-buffered until the pipe closes (which - # happens long after boot is over). Dropping `2>/dev/null` on - # tail so "cannot open log" / "retrying" diagnostics surface - # — if the log file never appears, we want to see why rather - # than stare at an apparently-silent boot. - # - # `tee -a "$mirror"` so the health-poll loop can fail fast - # when the CP's main.rs emits its terminal - # `devopsdefender: fatal:` eprintln. Without this the loop - # would burn its full 5-min budget polling /health on a - # tunnel that never came up. - ssh -i "$key" -T \ - -o StrictHostKeyChecking=accept-new \ - -o ServerAliveInterval=30 \ - tdx2@"$HOST" \ - "sudo stdbuf -oL tail -n +1 -F /var/log/ee-local-${ENV_LABEL}-cp.log || true" \ - 2>&1 | stdbuf -oL sed -u "s/^/ [dd-local-${ENV_LABEL}-cp console] /" \ - | stdbuf -oL tee -a "$mirror" & - tail_pid=$! 
- sleep 1 - for i in $(seq 1 60); do - CODE=$(curl -s -o /dev/null -w '%{http_code}' "${AGENT_URL}/health" || echo "000") - if [ "$CODE" = "200" ]; then - echo "CP healthy at ${AGENT_URL} (HTTP 200)" - exit 0 - fi - if [ -s "$mirror" ] && grep -q 'devopsdefender: fatal:' "$mirror"; then - echo "::group::Fatal line from CP console" - grep -m1 'devopsdefender: fatal:' "$mirror" || true - echo "::endgroup::" - echo "::error::CP exited fatal; not polling /health further" - exit 1 - fi - echo " waiting for tunnel (got HTTP ${CODE})... (${i}/60)" - sleep 5 - done - echo "::error::CP not healthy within 5 minutes (SSH target)" - exit 1 - - - name: Verify NEW VM via TDX attestation - env: - AGENT_URL: https://${{ inputs.hostname }} - run: | - # The Noise pre-handshake bundle used to live at `/attest`; - # it's now served inline on `/health` as `.noise.quote_b64` - # + `.noise.pubkey_hex` so a bastion-app can bootstrap in - # one fetch. `quote_b64` is an Intel-signed TDX quote whose - # `report_data` (first 32 bytes) equals the raw Noise static - # pubkey. MRTD = 48 bytes at offset 184 in TDX quote v4; - # non-zero means attestation worked. - for attempt in $(seq 1 60); do - BODY=$(curl -s -w '\n%{http_code}' \ - "${AGENT_URL}/health" || echo $'\n000') - CODE=$(echo "$BODY" | tail -n1) - JSON=$(echo "$BODY" | sed '$d') - if [ "$CODE" = "200" ]; then - QUOTE_B64=$(echo "$JSON" | jq -r '.noise.quote_b64 // empty') - if [ -n "$QUOTE_B64" ] && [ "$QUOTE_B64" != "null" ]; then - MRTD=$(echo "$QUOTE_B64" | base64 -d \ - | dd bs=1 skip=184 count=48 status=none | xxd -p -c 48) - if [ -n "$MRTD" ] && [ "$MRTD" != "$(printf '00%.0s' {1..48})" ]; then - PUBKEY=$(echo "$JSON" | jq -r '.noise.pubkey_hex // empty') - echo "NEW VM verified — MRTD: $MRTD, noise_pubkey: $PUBKEY" - exit 0 - fi - echo " /health .noise 200 but MRTD empty/zero, retrying... (${attempt}/60)" - else - echo " /health 200 but no .noise.quote_b64, retrying... 
(${attempt}/60)" - fi - else - echo " /health returned HTTP ${CODE}, retrying... (${attempt}/60)" - fi - sleep 10 - done - echo "::error::/health never returned a valid .noise.quote_b64 — stale tunnel or new VM never came up" - exit 1 - - - name: Verify dashboard is fronted by CF Access - env: - AGENT_URL: https://${{ inputs.hostname }} - run: | - # The root is now behind a CF Access self-hosted app. An - # unauthenticated browserless curl should see either: - # - 302 with Location: *.cloudflareaccess.com (login redirect) - # - 401 (API-style reject) - # A 200 means the app layer is serving the dashboard without - # CF gating, which we want to catch. - set +e - for attempt in $(seq 1 12); do - out=$(curl -s -o /dev/null -w '%{http_code}|%{redirect_url}' "${AGENT_URL}/") - code=${out%%|*} - location=${out#*|} - if [[ "$code" =~ ^30[12]$ ]] && [[ "$location" == *cloudflareaccess.com* ]]; then - echo "Dashboard gated by CF Access (HTTP ${code} → ${location}, attempt ${attempt})" - exit 0 - fi - if [ "$code" = "401" ]; then - echo "Dashboard gated by CF Access (HTTP 401, attempt ${attempt})" - exit 0 - fi - echo " dashboard returned HTTP ${code} (location=${location:-none}), retrying... (${attempt}/12)" - sleep 5 - done - echo "::error::dashboard / never returned a CF Access gate response" - exit 1 - - - name: Comment preview URL on PR - if: inputs.comment_on_pr && github.event_name == 'pull_request' - uses: actions/github-script@v7 - with: - script: | - const url = `https://${{ inputs.hostname }}`; - const body = [ - `### DD preview ready`, - ``, - `**URL:** ${url}`, - ``, - `Browser login: visit ${url} — Cloudflare Access routes you`, - `through GitHub OAuth. 
Membership (public) in the DD GitHub`, - `org grants access; the \`DD_ACCESS_ADMIN_EMAIL\` is the`, - `break-glass fallback.`, - ``, - `Machine-to-machine: GitHub Actions workflows in the`, - `DD_OWNER org pass their per-job OIDC JWT as`, - `\`Authorization: Bearer …\` (audience \`dd-agent\`).`, - ``, - `Register endpoint for a local agent: \`${url}/register\``, - `(CF-Access-bypassed; authenticated by ITA attestation).`, - ].join('\n'); - const { data: comments } = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - }); - const marker = '### DD preview ready'; - const existing = comments.find(c => c.user.type === 'Bot' && c.body && c.body.includes(marker)); - if (existing) { - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: existing.id, - body, - }); - } else { - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body, - }); - } - - # Cascade a relaunch of the matching dd-local-{env} libvirt domain - # on the tdx2 host, then block on it registering with the freshly- - # deployed CP. This is the gate: a release is "done" only when the - # local agent is back online talking to the new CP. - - name: Relaunch dd-local-${{ inputs.env == 'production' && 'prod' || 'preview' }} - if: inputs.relaunch_agent - uses: ./.github/actions/relaunch-agent - with: - kind: ${{ inputs.env == 'production' && 'prod' || 'preview' }} - url: https://${{ inputs.hostname }} - ref: ${{ inputs.ref }} - ssh-key: ${{ secrets.DD_LOCAL_SSH_KEY }} - host: ${{ secrets.DD_LOCAL_HOST }} - ita-api-key: ${{ secrets.DD_ITA_API_KEY }} - release-tag: ${{ inputs.release_tag }} - ee-channel: ${{ steps.ee.outputs.channel }} - ee-tag: ${{ inputs.ee_tag }} - owner: ${{ inputs.owner }} - - # Preview only: deploy hello-world as the end-to-end canary. 
- # Exercises the full GH-OIDC auth path (mint → agent verify - # against GitHub JWKS → repository_owner check) AND proves - # podman bootstrapped correctly on the agent. Prod uses - # web-nvidia-smi (below) for the same verification, so running - # hello-world there too is just noise. - # - # `continue-on-error` while an outstanding bug is investigated: - # `POST /deploy` consistently returns 2xx with empty body from - # GitHub Actions runners, even though direct curls to the same - # endpoint succeed. Neither dd-agent's /deploy diagnostic nor - # EE's handle_deploy diagnostic fire, so the request is dying - # in the CF/cloudflared/axum-extractor layer before any Rust - # code we own sees it. Unblocks unrelated PRs until fixed. - - name: Deploy hello-world via GH OIDC - if: inputs.relaunch_agent && inputs.env != 'production' - continue-on-error: true - uses: ./.github/actions/dd-deploy - with: - cp-url: https://${{ inputs.hostname }} - vm-name: dd-local-preview - workload: apps/hello-world/workload.json - - # Prod only: redeploy the nvidia-smi demo every main-push so - # `-gpu.devopsdefender.com` stays live and pinned to - # the freshly-booted prod agent. Preview agents have no GPU, so - # the workload's podman run with /dev/nvidia* devices would fail. - # `continue-on-error`: same bug as above. - - name: Deploy web-nvidia-smi via GH OIDC - if: inputs.relaunch_agent && inputs.env == 'production' - continue-on-error: true - uses: ./.github/actions/dd-deploy - with: - cp-url: https://${{ inputs.hostname }} - vm-name: dd-local-prod - workload: apps/web-nvidia-smi/workload.json - # nvidia-smi's container needs a `podman pull` + apt-get - # install of netcat in the first run — give it headroom - # past the default 120s so the /health poll catches it. 
- wait-for-deployment-seconds: '300' - - # Runs last so the relaunch cascade's own STONITH wave (old agent - # re-registers → old CP's CF tunnel is deleted → old CP poweroffs) - # is captured by this verification, in addition to the kill that - # happens when the new CP first registers its own tunnel. Also - # keeps the slowest + flakiest verify (24×5s loop + fallback - # force-delete) behind the user-facing outputs (PR comment, - # relaunched local agent). - # - # target=ssh skips this step entirely — the CP runs on tdx2 - # libvirt, there's no GCE instance to verify. Orphans from a - # historical gcp-target deploy are cleaned up via the separate - # `force-cleanup-tunnels.yml` workflow. - - name: Verify STONITH halted prior VM(s) in this env - if: inputs.target == 'gcp' - env: - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - run: | - # dd-register STONITHs the old VM on startup by deleting its - # CF tunnel → old cloudflared exits → old dd-register poweroffs. - # Scoped to this env — per-PR previews are hostname-isolated, - # so this only reaps prior deploys of the same env. - NEW_VM=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV}" \ - --format="value(name)" --sort-by=~creationTimestamp | head -1) - echo "new VM: $NEW_VM" - SURVIVORS="" - for i in $(seq 1 24); do - SURVIVORS=$(gcloud compute instances list \ - --project="$GCP_PROJECT_ID" \ - --filter="labels.devopsdefender=managed AND labels.dd_env=${DD_ENV} AND status=RUNNING" \ - --format="value(name)" \ - | grep -vx "$NEW_VM" || true) - if [ -z "$SURVIVORS" ]; then - echo "STONITH verified — only $NEW_VM running in ${DD_ENV}" - exit 0 - fi - echo " still running besides $NEW_VM: $(echo "$SURVIVORS" | tr '\n' ' ')" - echo " waiting for STONITH poweroff... 
(${i}/24)" - sleep 5 - done - echo "::warning::STONITH-by-tunnel-delete timed out; force-deleting zombies:" - echo "$SURVIVORS" - # shellcheck disable=SC2086 - gcloud compute instances delete $SURVIVORS \ - --project="$GCP_PROJECT_ID" --zone="$GCP_ZONE" --quiet || true - echo "zombies reaped; $NEW_VM is the only ${DD_ENV} VM" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index cbc86b6..0000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,208 +0,0 @@ -name: Release - -# One workflow to rule them all: build the static musl binary, publish -# it as a GitHub release asset, and deploy it to either the PR preview -# (per-PR ephemeral CP at pr-N.domain) or production (app.domain). Both -# paths cascade into a relaunch of the matching dd-local agent VM on -# the tdx2 host, and the Release run only goes green when that agent -# re-registers with the freshly-deployed CP. -# -# Paths: -# pull_request → build → deploy-preview → dd-local-preview relaunch -# push main → build → deploy-production → dd-local-prod relaunch -# push v* → build only (versioned release, no deploy) -# workflow_dispatch → build → deploy-production (rollback tool; -# release_tag input picks which tag to deploy) - -on: - push: - branches: [main] - tags: ['v*'] - paths-ignore: - - "README.md" - pull_request: - paths-ignore: - - "README.md" - workflow_dispatch: - inputs: - release_tag: - description: 'Release tag to deploy to production (rollback tool; default: latest)' - required: false - default: 'latest' - ee_tag: - description: 'Pin the libvirt base qcow2 to a specific easyenclave release (e.g. v0.3.1 or image-abc123456) for pre-flight testing a candidate EE. Empty = workflow default (prod + preview both pinned to the same v-tag in their with: blocks below).' - required: false - default: '' - -concurrency: - group: dd-release-${{ github.ref }} - # PR pushes cancel old runs. 
Main / tag / manual dispatch queue — - # we never want to cancel an in-progress prod deploy. - cancel-in-progress: ${{ github.event_name == 'pull_request' }} - -permissions: - contents: write - # GitHub release attestations (actions/attest-build-provenance). - # We sign every published `devopsdefender` binary so the CP can later - # verify a registering agent's artifact came from this workflow. - id-token: write - attestations: write - -jobs: - build: - runs-on: ubuntu-latest - outputs: - tag: ${{ steps.meta.outputs.tag }} - sha12: ${{ steps.meta.outputs.sha12 }} - steps: - - uses: actions/checkout@v4 - - - name: Install musl tools - run: sudo apt-get update && sudo apt-get install -y --no-install-recommends musl-tools - - - uses: dtolnay/rust-toolchain@stable - with: - targets: x86_64-unknown-linux-musl - components: rustfmt, clippy - - - uses: Swatinem/rust-cache@v2 - - # Folded in from the old ci.yml so we don't duplicate the compile - # pass. clippy + test run against the musl target we're about to - # build the release binary for — same toolchain, shared sccache, - # one compile of the dep graph for the whole job. 
- - name: cargo fmt - run: cargo fmt --all -- --check - - - name: cargo clippy - run: cargo clippy --workspace --all-targets --target x86_64-unknown-linux-musl -- -D warnings - - - name: cargo test - run: cargo test --workspace --target x86_64-unknown-linux-musl - - - name: Build static binaries - run: | - cargo build --release -p devopsdefender --target x86_64-unknown-linux-musl - - - name: Resolve release tag - id: meta - run: | - SHA12=$(echo "${{ github.sha }}" | cut -c1-12) - if [[ "${GITHUB_REF}" == refs/tags/v* ]]; then - TAG="${GITHUB_REF#refs/tags/}" - PRERELEASE="" - elif [[ "${{ github.event_name }}" == "pull_request" ]]; then - TAG="pr-${SHA12}" - PRERELEASE="--prerelease" - else - TAG="latest" - PRERELEASE="" - fi - echo "tag=${TAG}" >> "$GITHUB_OUTPUT" - echo "prerelease=${PRERELEASE}" >> "$GITHUB_OUTPUT" - echo "sha12=${SHA12}" >> "$GITHUB_OUTPUT" - - # Attest the binary's SHA256 against this workflow's identity - # (GitHub's Sigstore-backed signing OIDC'd as - # `https://github.com/devopsdefender/dd/.github/workflows/release.yml@`). - # The attestation is stored on the repo's /attestations endpoint - # and retrievable via `gh attestation verify` or the REST API. - # Skipped on fork PRs (they lack id-token). - - name: Attest devopsdefender binary - if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository - uses: actions/attest-build-provenance@v2 - with: - subject-path: target/x86_64-unknown-linux-musl/release/devopsdefender - subject-name: devopsdefender - - - name: Publish release - env: - GH_TOKEN: ${{ github.token }} - TAG: ${{ steps.meta.outputs.tag }} - PRERELEASE: ${{ steps.meta.outputs.prerelease }} - run: | - DD_BINARY=target/x86_64-unknown-linux-musl/release/devopsdefender - # Rolling tags (latest, pr-*) get recreated each run. Versioned - # v* tags are immutable. 
- if [[ "$TAG" == "latest" || "$TAG" == pr-* ]]; then - gh release delete "$TAG" --yes --cleanup-tag 2>/dev/null || true - fi - gh release create "$TAG" $PRERELEASE \ - --title "$TAG" \ - --notes "Built from ${GITHUB_SHA}" \ - "$DD_BINARY" - - - name: Rotate old PR pre-releases - if: github.event_name == 'pull_request' - env: - GH_TOKEN: ${{ github.token }} - run: | - gh release list --limit 200 --json tagName,isPrerelease,createdAt \ - | jq -r '.[] | select(.isPrerelease and (.tagName | startswith("pr-"))) | .tagName' \ - | tail -n +12 \ - | xargs -rI{} gh release delete {} --yes --cleanup-tag - - # Per-PR ephemeral preview at pr-{N}.{domain}. Browser auth is CF - # Access GitHub OAuth (public org membership) across all envs; there - # is no longer a preview-vs-prod auth difference. Cascades into - # dd-local-preview relaunch. - deploy-preview: - if: github.event_name == 'pull_request' - needs: build - permissions: - contents: read - id-token: write - pull-requests: write - uses: ./.github/workflows/deploy-cp.yml - with: - env: pr-${{ github.event.number }} - hostname: pr-${{ github.event.number }}.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - gcp_environment: staging - workload_identity_provider: 'projects/654815109728/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - service_account: 'easyenclave-staging-ci@eestaging.iam.gserviceaccount.com' - release_tag: ${{ needs.build.outputs.tag }} - comment_on_pr: true - ref: ${{ github.event.pull_request.head.ref }} - # Preview used to track the `staging` channel (newest image-* - # prerelease on EE main) for early regression signal, but an - # unreleasable EE main snapshot blocked every dd PR's CI until - # EE shipped a fix. Pin preview to the same tag as prod — one - # bump, one PR — so dd development isn't coupled to EE main's - # release health. Bump in lockstep with the prod pin below. 
- ee_tag: ${{ inputs.ee_tag || 'v0.3.1' }} - owner: devopsdefender - secrets: inherit - - # Production deploy at app.{domain}. Fires on push-to-main OR on a - # manual workflow_dispatch (rollback to a specific release_tag). - # Tag pushes (v*) intentionally do not auto-deploy — they just - # publish the artifact. Cascades into dd-local-prod relaunch. - deploy-production: - if: >- - (github.event_name == 'push' && github.ref == 'refs/heads/main') - || github.event_name == 'workflow_dispatch' - needs: build - permissions: - contents: read - id-token: write - # Granted (though unused — comment_on_pr=false here) so the - # permissions intersection with deploy-cp.yml's job matches. - pull-requests: write - uses: ./.github/workflows/deploy-cp.yml - with: - env: production - hostname: app.${{ vars.DD_CF_DOMAIN || 'devopsdefender.com' }} - gcp_environment: production - workload_identity_provider: 'projects/779946350556/locations/global/workloadIdentityPools/github-actions-pool/providers/github-provider' - service_account: 'easyenclave-production-ci@easyenclave.iam.gserviceaccount.com' - release_tag: ${{ inputs.release_tag || 'latest' }} - comment_on_pr: false - ref: main - # Production EE base image pin. Bump this (single-line PR) when - # we want prod to move to a newer easyenclave release. Static - # pinning is the whole point — prod should never silently pick - # up whatever EE shipped most recently. Preview still tracks - # the staging channel (see deploy-preview block above). - ee_tag: ${{ inputs.ee_tag || 'v0.3.1' }} - owner: devopsdefender - secrets: inherit diff --git a/.github/workflows/set-agent-owner.yml b/.github/workflows/set-agent-owner.yml deleted file mode 100644 index cd8f0b2..0000000 --- a/.github/workflows/set-agent-owner.yml +++ /dev/null @@ -1,100 +0,0 @@ -name: Set agent owner - -# One-shot operator tool: POST /owner on a DD agent to set its -# `agent_owner` (a principal — GitHub user, org, or specific repo). 
-# The /owner endpoint is fleet-gated — only an OIDC token whose -# claims match the agent's baked-in fleet principal (DD_OWNER / -# DD_OWNER_ID / DD_OWNER_KIND, set at provision time, currently -# devopsdefender for the dd-managed fleet) is accepted. So this -# workflow lives in the dd repo, not on the tenant's repo. -# -# Use cases: -# - Pinning dd-local-bot to `satsforcompute` so satsforcompute's CI -# can /deploy the bot binary. -# - Re-applying after a node reboot (agent_owner is runtime-only). -# - Clearing (set agent-owner to empty string). - -on: - workflow_dispatch: - inputs: - cp-url: - description: 'Control-plane URL (e.g. https://app.devopsdefender.com)' - required: true - default: https://app.devopsdefender.com - vm-name: - description: 'Target agent vm_name (e.g. dd-local-bot)' - required: true - agent-owner: - description: 'GitHub login (user/org) or owner/repo path to set as agent_owner. Empty string to clear.' - required: true - claim-id: - description: 'Optional claim_id for the audit log (free-form).' - required: false - default: '' - -permissions: - contents: read - id-token: write # required to mint the fleet OIDC token - -jobs: - set-owner: - runs-on: ubuntu-latest - steps: - - name: Resolve agent host + POST /owner - env: - CP_URL: ${{ inputs.cp-url }} - VM_NAME: ${{ inputs.vm-name }} - AGENT_OWNER: ${{ inputs.agent-owner }} - CLAIM_ID: ${{ inputs.claim-id }} - AUDIENCE: dd-agent - run: | - set -euo pipefail - # Mint an OIDC token whose repository_owner = devopsdefender; - # the agent's `require_fleet_oidc` accepts only this issuer. 
- TOKEN=$(curl -fsSL \ - "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=${AUDIENCE}" \ - -H "Authorization: Bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \ - | jq -r .value) - - AGENT_HOST=$(curl -fsSL \ - -H "Authorization: Bearer ${TOKEN}" \ - "${CP_URL}/api/agents" \ - | jq -r --arg vm "$VM_NAME" \ - '[.[] | select(.vm_name==$vm and .status=="healthy")][0].hostname') - if [ -z "$AGENT_HOST" ] || [ "$AGENT_HOST" = "null" ]; then - echo "::error::no healthy agent with vm_name=$VM_NAME" - exit 1 - fi - - # Resolve agent_owner to (id, kind) via gh api unless we're - # clearing. Slash → repo; otherwise look up users/ and - # branch on type. Hard-fail on lookup miss — better than - # baking a typo into runtime state. - if [ -z "$AGENT_OWNER" ]; then - OWNER_ID=0 - OWNER_KIND="" - elif [[ "$AGENT_OWNER" == */* ]]; then - OWNER_ID=$(gh api "repos/$AGENT_OWNER" -q .id) - OWNER_KIND=repo - else - read -r OWNER_ID GH_TYPE < <(gh api "users/$AGENT_OWNER" -q '"\(.id) \(.type)"') - case "$GH_TYPE" in - User) OWNER_KIND=user ;; - Organization) OWNER_KIND=org ;; - *) echo "::error::unexpected gh api type: $GH_TYPE"; exit 1 ;; - esac - fi - echo "POST https://${AGENT_HOST}/owner agent_owner=${AGENT_OWNER:-} kind=${OWNER_KIND:-} id=${OWNER_ID}" - BODY=$(jq -n \ - --arg owner "$AGENT_OWNER" \ - --argjson id "$OWNER_ID" \ - --arg kind "$OWNER_KIND" \ - --arg claim "$CLAIM_ID" \ - '{agent_owner: $owner, agent_owner_id: $id, agent_owner_kind: $kind, claim_id: $claim}') - - RESP=$(curl -fsSL \ - -X POST "https://${AGENT_HOST}/owner" \ - -H "Authorization: Bearer ${TOKEN}" \ - -H "Content-Type: application/json" \ - -d "$BODY") - echo "agent: $RESP" diff --git a/.github/workflows/website-preview.yml b/.github/workflows/website-preview.yml deleted file mode 100644 index af60b35..0000000 --- a/.github/workflows/website-preview.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Website Preview - -on: - pull_request: - types: [opened, reopened, synchronize, closed] - -permissions: - contents: write - 
pull-requests: write - -concurrency: preview-${{ github.ref }} - -jobs: - preview: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: rossjrw/pr-preview-action@v1 - with: - source-dir: . - preview-branch: gh-pages - umbrella-dir: pr-preview - action: auto diff --git a/.nojekyll b/.nojekyll deleted file mode 100644 index e69de29..0000000 diff --git a/CNAME b/CNAME deleted file mode 100644 index cc0b9fb..0000000 --- a/CNAME +++ /dev/null @@ -1 +0,0 @@ -devopsdefender.com diff --git a/Cargo.lock b/Cargo.lock index f15047f..900cd61 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,41 +2,6 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "aead" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0" -dependencies = [ - "crypto-common", - "generic-array", -] - -[[package]] -name = "aes" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", -] - -[[package]] -name = "aes-gcm" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "831010a0f742e1209b3bcea8fab6a8e149051ba6099432c8cb2cc117dec3ead1" -dependencies = [ - "aead", - "aes", - "cipher", - "ctr", - "ghash", - "subtle", -] - [[package]] name = "android_system_properties" version = "0.1.5" @@ -66,13 +31,11 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "axum" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" +checksum = "31b698c5f9a010f6573133b09e0de5408834d0c82f8d7475a89fc1867a71cd90" dependencies = [ "axum-core", - "axum-macros", - "base64", "bytes", "form_urlencoded", 
"futures-util", @@ -91,10 +54,8 @@ dependencies = [ "serde_json", "serde_path_to_error", "serde_urlencoded", - "sha1", "sync_wrapper", "tokio", - "tokio-tungstenite", "tower", "tower-layer", "tower-service", @@ -120,17 +81,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "axum-macros" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7aa268c23bfbbd2c4363b9cd302a4f504fb2a9dfe7e3451d66f35dd392e20aca" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "base64" version = "0.22.1" @@ -139,27 +89,9 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" [[package]] name = "bitflags" -version = "2.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" - -[[package]] -name = "blake2" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" -dependencies = [ - "digest", -] - -[[package]] -name = "block-buffer" -version = "0.10.4" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" [[package]] name = "bumpalo" @@ -175,9 +107,9 @@ checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "cc" -version = "1.2.57" +version = "1.2.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" +checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d" dependencies = [ "find-msvc-tools", "shlex", @@ -195,30 +127,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" -[[package]] -name = "chacha20" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3613f74bd2eac03dad61bd53dbe620703d4371614fe0bc3b9f04dd36fe4e818" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", -] - -[[package]] -name = "chacha20poly1305" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10cd79432192d1c0f4e1a0fef9527696cc039165d729fb41b3f4f4f354c2dc35" -dependencies = [ - "aead", - "chacha20", - "cipher", - "poly1305", - "zeroize", -] - [[package]] name = "chrono" version = "0.4.44" @@ -233,17 +141,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "cipher" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" -dependencies = [ - "crypto-common", - "inout", - "zeroize", -] - [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -251,110 +148,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] -name = "cpufeatures" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" -dependencies = [ - "libc", -] - -[[package]] -name = "crypto-common" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" -dependencies = [ - "generic-array", - "rand_core 0.6.4", - "typenum", -] - -[[package]] -name = "ctr" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835" -dependencies = [ - "cipher", -] - -[[package]] -name = "curve25519-dalek" -version = 
"4.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" -dependencies = [ - "cfg-if", - "cpufeatures", - "curve25519-dalek-derive", - "fiat-crypto", - "rustc_version", - "subtle", - "zeroize", -] - -[[package]] -name = "curve25519-dalek-derive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "data-encoding" -version = "2.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea" - -[[package]] -name = "deranged" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" -dependencies = [ - "powerfmt", -] - -[[package]] -name = "devopsdefender" +name = "dd-agent" version = "0.1.0" dependencies = [ "anyhow", "axum", - "base64", "chrono", - "futures-util", - "hex", "jsonwebtoken", - "libc", - "rand 0.8.5", "reqwest", "serde", "serde_json", - "snow", - "sysinfo", - "tempfile", "thiserror", "tokio", - "urlencoding", "uuid", - "x25519-dalek", ] [[package]] -name = "digest" -version = "0.10.7" +name = "deranged" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ - "block-buffer", - "crypto-common", - "subtle", + "powerfmt", ] [[package]] @@ -384,18 +199,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "fastrand" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" - -[[package]] -name = "fiat-crypto" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" - [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -432,23 +235,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" -[[package]] -name = "futures-macro" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" - [[package]] name = "futures-task" version = "0.3.32" @@ -462,23 +248,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-core", - "futures-macro", - "futures-sink", "futures-task", "pin-project-lite", "slab", ] -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - [[package]] name = "getrandom" version = "0.2.17" @@ -519,16 +293,6 @@ dependencies = [ "wasip3", ] -[[package]] -name = "ghash" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0d8a4362ccb29cb0b265253fb0a2728f592895ee6854fd9bc13f2ffda266ff1" -dependencies = [ - "opaque-debug", - "polyval", -] - [[package]] name = "hashbrown" version = "0.15.5" @@ -540,9 +304,9 @@ dependencies = [ 
[[package]] name = "hashbrown" -version = "0.16.1" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" [[package]] name = "heck" @@ -550,12 +314,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - [[package]] name = "http" version = "1.4.0" @@ -603,9 +361,9 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "hyper" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" dependencies = [ "atomic-waker", "bytes", @@ -617,7 +375,6 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "pin-utils", "smallvec", "tokio", "want", @@ -625,15 +382,14 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.7" +version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ "http", "hyper", "hyper-util", "rustls", - "rustls-pki-types", "tokio", "tokio-rustls", "tower-service", @@ -675,7 +431,7 @@ dependencies = [ "js-sys", "log", "wasm-bindgen", - "windows-core 0.62.2", + "windows-core", ] [[package]] @@ -689,12 +445,13 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.1.1" +version = "2.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" dependencies = [ "displaydoc", "potential_utf", + "utf8_iter", "yoke", "zerofrom", "zerovec", @@ -702,9 +459,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" dependencies = [ "displaydoc", "litemap", @@ -715,9 +472,9 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" dependencies = [ "icu_collections", "icu_normalizer_data", @@ -729,15 +486,15 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" [[package]] name = "icu_properties" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" dependencies = [ "icu_collections", "icu_locale_core", @@ -749,15 +506,15 @@ dependencies = [ [[package]] name = "icu_properties_data" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" [[package]] name = "icu_provider" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" dependencies = [ "displaydoc", "icu_locale_core", @@ -787,9 +544,9 @@ dependencies = [ [[package]] name = "idna_adapter" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" dependencies = [ "icu_normalizer", "icu_properties", @@ -797,25 +554,16 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.0" +version = "2.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown 0.16.1", + "hashbrown 0.17.0", "serde", "serde_core", ] -[[package]] -name = "inout" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" -dependencies = [ - "generic-array", -] - [[package]] name = "ipnet" version = "2.12.0" @@ -824,9 +572,9 @@ checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "iri-string" -version = "0.7.10" +version = "0.7.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20" dependencies = [ 
"memchr", "serde", @@ -840,10 +588,12 @@ checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "js-sys" -version = "0.3.91" +version = "0.3.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf" dependencies = [ + "cfg-if", + "futures-util", "once_cell", "wasm-bindgen", ] @@ -871,21 +621,15 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.183" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" - -[[package]] -name = "linux-raw-sys" -version = "0.12.1" +version = "0.2.186" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" [[package]] name = "litemap" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" [[package]] name = "log" @@ -919,24 +663,15 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "mio" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", "wasi", "windows-sys 0.61.2", ] -[[package]] -name = "ntapi" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" -dependencies = [ - "winapi", -] - [[package]] name = "num-bigint" version = "0.4.6" @@ -949,9 +684,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" [[package]] name = "num-integer" @@ -977,12 +712,6 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" -[[package]] -name = "opaque-debug" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" - [[package]] name = "pem" version = "3.0.6" @@ -1005,40 +734,11 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "poly1305" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8159bd90725d2df49889a078b54f4f79e87f1f8a8444194cdca81d38f5393abf" -dependencies = [ - "cpufeatures", - "opaque-debug", - "universal-hash", -] - -[[package]] -name = "polyval" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25" -dependencies = [ - "cfg-if", - "cpufeatures", - "opaque-debug", - "universal-hash", -] - [[package]] name = "potential_utf" -version = "0.1.4" +version = "0.1.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" dependencies = [ "zerovec", ] @@ -1106,7 +806,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand", "ring", "rustc-hash", "rustls", @@ -1155,33 +855,12 @@ checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.4", -] - -[[package]] -name = "rand" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" -dependencies = [ - "rand_chacha 0.9.0", - "rand_core 0.9.5", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ - "ppv-lite86", - "rand_core 0.6.4", + "rand_chacha", + "rand_core", ] [[package]] @@ -1191,16 +870,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core 0.9.5", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.17", + "rand_core", ] [[package]] @@ -1266,37 +936,15 @@ dependencies = [ [[package]] name = "rustc-hash" -version = "2.1.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - -[[package]] -name = "rustc_version" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" -dependencies = [ - "semver", -] - -[[package]] -name = "rustix" -version = "1.1.4" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.61.2", -] +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" [[package]] name = "rustls" -version = "0.23.37" +version = "0.23.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" dependencies = [ "once_cell", "ring", @@ -1308,9 +956,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.14.0" +version = "1.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" dependencies = [ "web-time", "zeroize", @@ -1318,9 +966,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "ring", "rustls-pki-types", @@ -1341,9 +989,9 @@ checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] name = "semver" -version = 
"1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "serde" @@ -1411,28 +1059,6 @@ dependencies = [ "serde", ] -[[package]] -name = "sha1" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - [[package]] name = "shlex" version = "1.3.0" @@ -1473,22 +1099,6 @@ version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" -[[package]] -name = "snow" -version = "0.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "850948bee068e713b8ab860fe1adc4d109676ab4c3b621fd8147f06b261f2f85" -dependencies = [ - "aes-gcm", - "blake2", - "chacha20poly1305", - "curve25519-dalek", - "rand_core 0.6.4", - "rustc_version", - "sha2", - "subtle", -] - [[package]] name = "socket2" version = "0.6.3" @@ -1542,32 +1152,6 @@ dependencies = [ "syn", ] -[[package]] -name = "sysinfo" -version = "0.33.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fc858248ea01b66f19d8e8a6d55f41deaf91e9d495246fd01368d99935c6c01" -dependencies = [ - "core-foundation-sys", - "libc", - "memchr", - "ntapi", - "windows", -] - -[[package]] -name = "tempfile" -version = "3.27.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" -dependencies = [ - "fastrand", - "getrandom 0.4.2", - "once_cell", - "rustix", - "windows-sys 0.61.2", -] - [[package]] name = "thiserror" version = "2.0.18" @@ -1621,9 +1205,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" dependencies = [ "displaydoc", "zerovec", @@ -1646,9 +1230,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.50.0" +version = "1.52.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" dependencies = [ "bytes", "libc", @@ -1662,9 +1246,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", @@ -1681,18 +1265,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-tungstenite" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d25a406cddcc431a75d3d9afc6a7c0f7428d4891dd973e4d54c56b46127bf857" -dependencies = [ - "futures-util", - "log", - "tokio", - "tungstenite", -] - [[package]] name = "tower" version = "0.5.3" @@ -1765,29 +1337,6 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" -[[package]] -name = "tungstenite" 
-version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8628dcc84e5a09eb3d8423d6cb682965dea9133204e8fb3efee74c2a0c259442" -dependencies = [ - "bytes", - "data-encoding", - "http", - "httparse", - "log", - "rand 0.9.2", - "sha1", - "thiserror", - "utf-8", -] - -[[package]] -name = "typenum" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" - [[package]] name = "unicode-ident" version = "1.0.24" @@ -1800,16 +1349,6 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" -[[package]] -name = "universal-hash" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea" -dependencies = [ - "crypto-common", - "subtle", -] - [[package]] name = "untrusted" version = "0.9.0" @@ -1828,18 +1367,6 @@ dependencies = [ "serde", ] -[[package]] -name = "urlencoding" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" - -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - [[package]] name = "utf8_iter" version = "1.0.4" @@ -1848,21 +1375,15 @@ checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" [[package]] name = "uuid" -version = "1.22.0" +version = "1.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +checksum = "ddd74a9687298c6858e9b88ec8935ec45d22e8fd5e6394fa1bd4e99a87789c76" dependencies = [ "getrandom 0.4.2", "js-sys", "wasm-bindgen", ] 
-[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - [[package]] name = "want" version = "0.3.1" @@ -1880,11 +1401,11 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.2+wasi-0.2.9" +version = "1.0.3+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +checksum = "20064672db26d7cdc89c7798c48a0fdfac8213434a1186e5ef29fd560ae223d6" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.57.1", ] [[package]] @@ -1893,14 +1414,14 @@ version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ - "wit-bindgen", + "wit-bindgen 0.51.0", ] [[package]] name = "wasm-bindgen" -version = "0.2.114" +version = "0.2.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" +checksum = "df52b6d9b87e0c74c9edfa1eb2d9bf85e5d63515474513aa50fa181b3c4f5db1" dependencies = [ "cfg-if", "once_cell", @@ -1911,23 +1432,19 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.64" +version = "0.4.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" +checksum = "af934872acec734c2d80e6617bbb5ff4f12b052dd8e6332b0817bce889516084" dependencies = [ - "cfg-if", - "futures-util", "js-sys", - "once_cell", "wasm-bindgen", - "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.114" +version = "0.2.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +checksum = "78b1041f495fb322e64aca85f5756b2172e35cd459376e67f2a6c9dffcedb103" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1935,9 +1452,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.114" +version = "0.2.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" +checksum = "9dcd0ff20416988a18ac686d4d4d0f6aae9ebf08a389ff5d29012b05af2a1b41" dependencies = [ "bumpalo", "proc-macro2", @@ -1948,9 +1465,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.114" +version = "0.2.120" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" +checksum = "49757b3c82ebf16c57d69365a142940b384176c24df52a087fb748e2085359ea" dependencies = [ "unicode-ident", ] @@ -1991,9 +1508,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.91" +version = "0.3.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" +checksum = "2eadbac71025cd7b0834f20d1fe8472e8495821b4e9801eb0a60bd1f19827602" dependencies = [ "js-sys", "wasm-bindgen", @@ -2011,81 +1528,26 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "1.0.6" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" dependencies = [ "rustls-pki-types", ] -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - 
"winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows" -version = "0.57.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" -dependencies = [ - "windows-core 0.57.0", - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-core" -version = "0.57.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" -dependencies = [ - "windows-implement 0.57.0", - "windows-interface 0.57.0", - "windows-result 0.1.2", - "windows-targets 0.52.6", -] - [[package]] name = "windows-core" version = "0.62.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" dependencies = [ - "windows-implement 0.60.2", - "windows-interface 0.59.3", + "windows-implement", + "windows-interface", "windows-link", - "windows-result 0.4.1", + "windows-result", "windows-strings", ] -[[package]] -name = "windows-implement" -version = "0.57.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "windows-implement" version = "0.60.2" @@ -2097,17 +1559,6 @@ dependencies = [ "syn", ] -[[package]] -name = "windows-interface" -version = "0.57.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "windows-interface" version = "0.59.3" @@ -2125,15 +1576,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" -[[package]] -name = "windows-result" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" -dependencies = [ - "windows-targets 0.52.6", -] - [[package]] name = "windows-result" version = "0.4.1" @@ -2317,6 +1759,12 @@ dependencies = [ "wit-bindgen-rust-macro", ] +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + [[package]] name = "wit-bindgen-core" version = "0.51.0" @@ -2398,27 +1846,15 @@ dependencies = [ [[package]] name = "writeable" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" - -[[package]] -name = "x25519-dalek" -version = "2.0.1" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7e468321c81fb07fa7f4c636c3972b9100f0346e5b6a9f2bd0603a52f7ed277" -dependencies = [ - "curve25519-dalek", - "rand_core 0.6.4", - "serde", - "zeroize", -] +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "yoke" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -2427,9 +1863,9 @@ 
dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", @@ -2439,18 +1875,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.47" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", @@ -2459,18 +1895,18 @@ dependencies = [ [[package]] name = "zerofrom" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", @@ -2483,26 +1919,12 @@ name = "zeroize" version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" -dependencies = [ - 
"zeroize_derive", -] - -[[package]] -name = "zeroize_derive" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] [[package]] name = "zerotrie" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" dependencies = [ "displaydoc", "yoke", @@ -2511,9 +1933,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ "yoke", "zerofrom", @@ -2522,9 +1944,9 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 27fe0e5..e5630fd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,30 +1,3 @@ -[package] -name = "devopsdefender" -version = "0.1.0" -edition = "2021" -license = "MIT" -repository = "https://github.com/devopsdefender/dd" - -[dependencies] -anyhow = "1" -axum = { version = "0.8", features = ["ws", "macros"] } -base64 = "0.22" -chrono = { version = "0.4", features = ["serde"] } -hex = "0.4" -futures-util = "0.3" -jsonwebtoken = "9" -libc = "0.2" -rand = "0.8" -reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } -serde = { version = "1", features = 
["derive"] } -serde_json = "1" -snow = { version = "0.9", default-features = false, features = ["default-resolver"] } -sysinfo = { version = "0.33", default-features = false, features = ["system", "disk", "network"] } -thiserror = "2" -tokio = { version = "1", features = ["macros", "process", "rt-multi-thread", "signal", "time", "fs", "net", "io-util", "sync"] } -urlencoding = "2" -uuid = { version = "1", features = ["v4"] } -x25519-dalek = { version = "2", features = ["static_secrets"] } - -[dev-dependencies] -tempfile = "3" +[workspace] +members = ["crates/dd-agent"] +resolver = "2" diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 8922eb9..0000000 --- a/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -# devopsdefender — unified binary for fleet management. -# -# DEVELOPMENT ONLY. CI/CD does not use this — easyenclave fetches the -# devopsdefender binary directly from this repo's GitHub releases via -# its github_release workload source. See .github/workflows/release.yml. -# This Dockerfile is kept for local development and ad-hoc image builds. -# -# Subcommands: -# devopsdefender management — control plane (register + dashboard) -# devopsdefender agent — in-VM agent - -FROM rust:1-bookworm AS builder -RUN rustup target add x86_64-unknown-linux-musl && \ - apt-get update && apt-get install -y --no-install-recommends musl-tools && \ - rm -rf /var/lib/apt/lists/* -WORKDIR /src -COPY . . 
-RUN cargo build --release -p devopsdefender --target x86_64-unknown-linux-musl - -FROM debian:bookworm-slim -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates curl \ - && curl -fsSL -o /usr/local/bin/cloudflared \ - https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 \ - && chmod +x /usr/local/bin/cloudflared \ - && apt-get purge -y curl \ - && apt-get autoremove -y \ - && rm -rf /var/lib/apt/lists/* -COPY --from=builder /src/target/x86_64-unknown-linux-musl/release/devopsdefender /usr/local/bin/devopsdefender -ENTRYPOINT ["/usr/local/bin/devopsdefender"] diff --git a/README.md b/README.md index 2aa8280..8b649f2 100644 --- a/README.md +++ b/README.md @@ -1,130 +1,84 @@ # DevOps Defender -Confidential computing marketplace. Run AI workloads on hardware-sealed Intel TDX VMs with cryptographic attestation. +DD is an attested execution layer for open-source agent workloads. -## Architecture +This branch deletes the old v1 control-plane/fleet implementation and keeps the +new core primitive: a small agent with runtime ownership, GitHub OIDC deploys, +and a public proof document. -One Cargo crate, one static musl binary, two run modes: +## What Exists Now -``` -DD_MODE=cp devopsdefender # control-plane (dashboard, register, collector, STONITH) -DD_MODE=agent devopsdefender # in-VM agent (dashboard, /deploy, /logs, metrics) -``` - -Source layout (all under `src/`, flat module tree): - -| Module | Responsibility | -|---|---| -| `cp.rs` | CP HTTP: fleet dashboard, `/register` for agents, `/api/agents` public read, `/cp/attest`. Runs the collector + per-agent CF Access app + STONITH. | -| `agent.rs` | Agent HTTP: per-VM dashboard, `/deploy` + `/exec` + `/logs/{app}` + `/ingress/replace`, GitHub-OIDC and ITA verification. | -| `cf.rs` | Cloudflare API: tunnel CRUD, DNS CNAME, Access app provisioning, flat `label_hostname`, orphan reaping. 
| -| `ee.rs` | Thin client for [EasyEnclave](https://github.com/easyenclave/easyenclave)'s unix socket — `Deploy`, `List`, `Logs`. | -| `ita.rs` | Mint + verify Intel Trust Authority tokens (quote-v4 MRTD extraction). | -| `gh_oidc.rs` | Verify GitHub Actions OIDC JWTs against GitHub's JWKS (`repository_owner == DD_OWNER`). | -| `collector.rs` | CP-side scrape of agent `/health` over the tunnel; tracks claims + ingress. | -| `stonith.rs` | On CP boot, delete old tunnel → old cloudflared dies → old CP observes and `poweroff`s. | -| `metrics.rs` | Per-host CPU/disk/net via the `sysinfo` crate. | -| `config.rs` | Env → typed config for both modes. | -| `html.rs` | Dashboard templates. | - -The sealed enclave runtime is [EasyEnclave](https://github.com/easyenclave/easyenclave) — a separate project. +- `crates/dd-agent`: minimal Rust agent. +- `.github/actions/assign`: idempotently assigns an agent to a GitHub principal. +- `.github/actions/deploy`: deploys a workload from the current owner repo. +- `.github/actions/verify`: checks the public agent proof document. +- `docs/spec-v2.md`: product model. +- `docs/threat-model-v2.md`: operator, owner, and verifier boundaries. +- `docs/rewrite-plan.md`: remaining migration plan. -## Public website +## Model -[**devopsdefender.com**](https://devopsdefender.com) is a static site served from this repo's [`gh-pages` branch](https://github.com/devopsdefender/dd/tree/gh-pages) (CNAME pinned there). It's the **only** place public-facing marketing copy lives — the CP binary serves operator dashboards behind CF Access and is never the right home for public prose. +Every agent has one current owner principal: -To change the website: PR against `gh-pages` (not `main`). The branch's own `.github/workflows/website-preview.yml` auto-deploys each PR to `devopsdefender.com/pr-preview//` via [`rossjrw/pr-preview-action`](https://github.com/rossjrw/pr-preview-action); merging to `gh-pages` publishes to root. 
+- `user:<login>#<id>` +- `org:<login>#<id>` +- `repo:<owner>/<repo>#<id>` -## Deployment +Ownership is runtime state. Reboot can clear it. External automation is expected +to call `/owner` repeatedly until the agent's `/health` proof reflects the +desired owner. -Every fleet VM boots from a sealed easyenclave image published by [easyenclave/easyenclave](https://github.com/easyenclave/easyenclave/releases). No cloud-init, no stock Ubuntu, no runtime `apt-get install`. The TDX VM's rootfs is the latest image in the `easyenclave-staging` (or `-stable`) family, attestable against a single UKI SHA256. +The assignment authority and deploy authority are separate: -Every workload is a JSON spec consumed by easyenclave's `DeployRequest`. Boot-time and runtime-deployed workloads share one schema; both the `devopsdefender` binary and `cloudflared` ship as **GitHub release assets** — not OCI images — and easyenclave fetches them via its `github_release` source. The full set of specs and a guide to writing your own lives in [`apps/README.md`](apps/README.md). +- `/owner` accepts the assignment authority's GitHub Actions OIDC token. +- `/deploy`, `/logs/{app}`, and `/exec` accept only the current owner's token. +- `/health` is public proof for users and third-party verifiers. -Per-VM configuration (CF credentials, GitHub OAuth, the workload spec itself) is passed to easyenclave at boot via **GCE instance metadata** (`ee-config` attribute), read by `easyenclave::init::fetch_gce_metadata_config()` and applied as env vars. The CP-deploy step in `.github/workflows/deploy-cp.yml` builds the spec and invokes `gcloud compute instances create --image-family=easyenclave-staging --metadata-from-file=ee-config=...`. 
+## Agent -## CI/CD +Run a local development agent: +```bash +DD_ASSIGNMENT_AUTHORITY_KIND=repo \ +DD_ASSIGNMENT_AUTHORITY_NAME=example/assigner \ +DD_ASSIGNMENT_AUTHORITY_ID=123456789 \ +cargo run -p dd-agent ``` -PR → pre-release tagged pr-{sha12}, then ephemeral preview at pr-{N}.{domain} -branch deleted → pr-teardown.yml deletes the preview's VM, CF tunnel, and DNS -push to main → rolling `latest` release, then auto-deploy to production -push v* tag → versioned release (no auto-deploy) -manual dispatch → redeploy any existing tag to production (rollback tool) -``` - -Every path lives in `.github/workflows/release.yml`: one `build` job, then either `deploy-preview` (PR) or `deploy-production` (main / dispatch), both calling the reusable `deploy-cp.yml` with env-specific inputs. Each cascades into a relaunch of the matching `dd-local-{env}` VM on the tdx2 host — the Release run only goes green when that agent re-registers with the freshly-deployed CP. Verifications along the way: - -1. `/health` via the Cloudflare tunnel (CF Access bypass; public) -2. `/cp/attest` returning a real TDX MRTD (CF Access bypass; the quote is self-authenticating — old VMs don't have the endpoint and return 404) -3. Dashboard `/` returning a CF Access redirect (HTTP 302) to the Cloudflare login flow -4. No other `dd-{env}-*` VM is RUNNING after deploy (STONITH must have halted the previous instance) -5. `dd-local-{env}` re-registers with the new CP within 5 min - -## Auth -Zero shared secrets. Every CP and agent URL is fronted by [Cloudflare Access](https://developers.cloudflare.com/cloudflare-one/applications/); everything else is gated by signed tokens validated in code. 
+Endpoints: -| Caller | Endpoint | Auth | +| Endpoint | Purpose | Auth | | --- | --- | --- | -| Human browser | CP `/`, agent `/`, ttyd terminal | CF Access → GitHub OAuth → `github-organization:DD_OWNER` or `emails:DD_ACCESS_ADMIN_EMAIL` | -| Agent → CP | `/register`, `/ingress/replace` | CF Access bypass + Intel ITA token verified in-code | -| CI → agent | `/deploy`, `/exec`, `/logs/{app}` | CF Access bypass + GitHub Actions OIDC JWT verified in-code (`repository_owner == DD_OWNER`) | -| Anyone | `/health`, `/cp/attest`, `/api/agents`, workload URLs | CF Access bypass; read-only or self-authenticating content | - -No PATs. No CF Access service tokens. No Worker. Agents ship with nothing but an ITA API key; CI ships with nothing but its per-job GitHub OIDC token. - -CF Access apps are provisioned programmatically by the CP at boot — one application per hostname (CP, agent, each admin-gated workload label like `-term`). Orphan apps from torn-down preview VMs are reaped on the next CP boot. - -First-time setup on a fresh Cloudflare account: -1. Zero Trust → Settings → Authentication → Login methods → add GitHub (`read:user` scope only). -2. Extend `DD_CF_API_TOKEN` with **Access: Apps and Policies: Edit** and **Access: Identity Providers: Read**. -3. Set repo var/secret `DD_ACCESS_ADMIN_EMAIL` (break-glass human login). -4. Deploy. No per-deploy bootstrap step. - -## Deploy a workload from GitHub Actions - -The [`dd-deploy`](.github/actions/dd-deploy/README.md) composite action mints a per-job OIDC token and POSTs any workload JSON to a DD agent. 
Works from any repository in the `DD_OWNER` GitHub org with zero stored credentials: - -```yaml -jobs: - deploy: - runs-on: ubuntu-latest - permissions: - id-token: write - contents: read - steps: - - uses: actions/checkout@v4 - - uses: devopsdefender/dd/.github/actions/dd-deploy@main - with: - cp-url: https://app.devopsdefender.com - vm-name: dd-local-prod - workload: apps/myapp/workload.json +| `GET /health` | public proof and liveness | none | +| `POST /owner` | set current owner | assignment authority OIDC | +| `POST /deploy` | launch workload | current owner OIDC | +| `GET /logs/{app}` | read workload logs | current owner OIDC | +| `POST /exec` | optional debug command | current owner OIDC + capability enabled | + +## Workload Shape + +```json +{ + "app_name": "oracle", + "cmd": ["/bin/sh", "-c", "echo oracle; sleep 60"], + "source": { + "repo": "example/oracle", + "ref": "refs/heads/main", + "commit": "..." + }, + "artifact_digest": "sha256:...", + "spec_digest": "sha256:..." +} ``` -The agent verifies the OIDC token against GitHub's JWKS, checks `repository_owner == DD_OWNER`, and launches the workload. Full inputs/outputs in [`.github/actions/dd-deploy/README.md`](.github/actions/dd-deploy/README.md). On deploy timeout, `dd-deploy` fetches `/logs/dd-agent` over the same OIDC auth so CI logs show agent-side ground truth without an SSH hop. +Production workload repositories should own their deploy workflows. The DD repo +provides the substrate and actions; it should not contain production bot, +oracle, or LLM-agent workloads. -## Terminal access - -Each VM runs [ttyd](https://github.com/tsl0922/ttyd) as a workload on a `-term` labelled subdomain (e.g. `app-term.devopsdefender.com`, `-term.devopsdefender.com`). CF Access gates it behind the same GitHub OAuth + admin-email policy as the dashboards — no SSH, no shared keys. - -## STONITH - -When a new CP boots, it needs to kick out the old one. 
It does this by deleting the old tunnel via the Cloudflare API — when the old `cloudflared` loses its tunnel, it exits, and the old CP observes the exit and calls `poweroff`. The old VM shuts down, GCP marks it TERMINATED. - -Old tunnels are identified by their **ingress configuration** (which hostname they serve), not by reconstructing a hostname from the tunnel name. This is the correct identifier because CP tunnels all serve `app-{env}.{domain}` regardless of their individual tunnel name. - -If STONITH fails, `release.yml` detects the surviving VM and fails the deploy — loud signal, no silent accumulation. - -## Build +## Validation ```bash -cargo build --release -# Produces: target/release/devopsdefender +cargo fmt --all -- --check +cargo check --workspace +cargo test --workspace ``` - -For local dev you can also build the Dockerfile (`docker build -t dd .`) but CI/CD does not — production deploys consume the GitHub release asset directly. - -## License - -MIT diff --git a/apps/README.md b/apps/README.md deleted file mode 100644 index 0ed5f85..0000000 --- a/apps/README.md +++ /dev/null @@ -1,169 +0,0 @@ -# apps/ — worked example of a DD agent VM - -This directory is **a worked example**, not a bundle dd ships to users. Every -directory here is one easyenclave workload. Together they describe a complete -DD agent VM: the minimum infra to boot podman, run one demo container -(`web-nvidia-smi`), register with a control plane, and expose the demo on a -stable hostname. - -The goal is to be the shortest legible "agent VM from scratch" that you can -copy and adapt. For orchestrating many workloads, assembling them from -templates, and the run / teardown lifecycle, see -[slopandmop](https://github.com/slopandmop/slopandmop). 
- -## Layout - -``` -apps/ - / - workload.json # literal spec - workload.json.tmpl # spec with ${VAR} placeholders (baked at deploy time) - _infra/ # host-side scripts; not a deployable workload -``` - -## What a workload looks like - -A **workload** is a JSON object consumed by easyenclave's `DeployRequest` (see -`src/easyenclave/src/workload.rs`). Minimum shape: - -```json -{ - "app_name": "myapp", - "cmd": ["/bin/busybox", "sh", "-c", "echo hello; sleep inf"] -} -``` - -Add `github_release` to fetch a binary asset directly from a GitHub release — -no OCI registry, no Dockerfile. The asset lands in `/var/lib/easyenclave/bin/` -and is spawned by `cmd`: - -```json -{ - "app_name": "cloudflared", - "github_release": { - "repo": "cloudflare/cloudflared", - "asset": "cloudflared-linux-amd64", - "rename": "cloudflared" - } -} -``` - -Add `env` to inject config: - -```json -{ - "env": ["MY_ENDPOINT=https://api.example.com", "DEBUG=1"] -} -``` - -Add `expose` to ask DD to route a public hostname to a workload's port: - -```json -{ - "app_name": "web-nvidia-smi", - "expose": { "hostname_label": "gpu", "port": 8081 }, - "cmd": [...] -} -``` - -At agent boot, `apps/_infra/local-agents.sh` collects every `expose` entry -into `DD_EXTRA_INGRESS`. dd-agent forwards them on `/register` and the CP -prepends them to the agent's cloudflared tunnel ingress. A workload declaring -`{"hostname_label": "gpu", "port": 8081}` becomes reachable at -`gpu.` — in addition to the default dashboard at -``. easyenclave itself ignores the field; it's a DD-level -hint about tunnel routing. - -Per-workload ingress is **boot-time only** today. Workloads POSTed later via -`/deploy` don't get auto-exposed — declare your exposure on boot workloads in -this tree. - -## Templates - -Files ending in `.json.tmpl` carry `${VAR}` placeholders. At bake time: - -1. `envsubst` substitutes every uppercase `${VAR}` that appears in the - template using the caller's environment. -2. 
`jq` drops env-array entries whose value ended up empty (so you can make - OAuth creds / optional secrets conditional by just leaving them unset). -3. The result is a plain `workload.json` ready for EE. - -Only uppercase placeholders get substituted — shell locals like `$i` or -`$((n+1))` inside `cmd` strings are left alone. The bake helper is duplicated -inline in two places so both lifecycle points behave identically: - -- `.github/workflows/deploy-cp.yml` (CI, for CP workloads) -- `apps/_infra/local-agents.sh` (tdx2 host, for agent VMs) - -## Where each workload runs - -| workload | CP VM | agent VM (preview) | agent VM (prod) | -|---|---|---|---| -| `cloudflared` | ✅ | ✅ | ✅ | -| `dd-agent` | | ✅ | ✅ | -| `dd-management` | ✅ | | | -| `nv` | | | ✅ (GPU insmod) | -| `podman-static` | | ✅ | ✅ | -| `podman-bootstrap` | | ✅ | ✅ | -| `web-nvidia-smi` | | | ✅ (`gpu.`) | - -CP stays slim: just `cloudflared` + `dd-management`. Preview agent VMs run a -bare agent + podman for CI to prove registration end-to-end. Prod agent VMs -add the GPU insmod and the `web-nvidia-smi` demo on `gpu.`. - -## Ordering - -EasyEnclave spawns boot workloads concurrently — there's no declared -dependency graph. Dependents self-sequence by polling for their prerequisites. -Worked examples from this tree: - -- `podman-bootstrap` waits for `podman-static`'s tarball - (`until [ -x $SRC/usr/local/bin/podman ]; do sleep 1; done`). -- `web-nvidia-smi`'s cmd waits for the wrapper - (`until [ -x /var/lib/easyenclave/bin/podman ]; do sleep 2; done`). - -Costs seconds of wasted polling at boot; easy to reason about; no -workload-runner changes needed. - -## Deploying your own - -1. Copy an existing folder as a starting point: - ``` - cp -r apps/cloudflared apps/myapp - $EDITOR apps/myapp/workload.json - ``` -2. Decide where it runs: - - **CP VM**: add a `bake apps/myapp/workload.json` line to the - workload-building `run:` step in `.github/workflows/deploy-cp.yml`. 
- - **Agent VM**: add the same call to `apps/_infra/local-agents.sh` in - `build_config_iso()`. - - **Ad-hoc, runtime-only**: POST the baked JSON to `/deploy` on a running - agent. The endpoint is CF-Access-bypassed and gated in-code by a - GitHub Actions OIDC JWT. From inside a GitHub Actions workflow - running in the `DD_OWNER` org: - ``` - OIDC=$(curl -fsSL \ - -H "Authorization: Bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" \ - "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=dd-agent" | jq -r .value) - curl -fsS -X POST https:///deploy \ - -H "Authorization: Bearer ${OIDC}" \ - -H "Content-Type: application/json" \ - -d @apps/myapp/workload.json - ``` - -## Reference - -- Schema source of truth: - [`src/easyenclave/src/workload.rs`](../src/easyenclave/src/workload.rs) — - the `DeployRequest` struct EE deserializes on `/deploy`. `expose` is not in - this struct; EE silently ignores it. DD reads it at the bake + register - boundary. -- CP deploy caller: - [`.github/workflows/deploy-cp.yml`](../.github/workflows/deploy-cp.yml) — - inline `bake()` + CP workload set. -- Agent VM builder: - [`apps/_infra/local-agents.sh`](_infra/local-agents.sh) — inline `bake()` + - agent workload set per kind. -- Ingress plumbing: `src/cf.rs` (`create()` takes per-workload ingress), - `src/cp.rs` (`register` handler accepts `extra_ingress`), `src/agent.rs` - (reads `DD_EXTRA_INGRESS`, forwards on `/register`). diff --git a/apps/_infra/dd-relaunch-cp.sh b/apps/_infra/dd-relaunch-cp.sh deleted file mode 100755 index aeb5028..0000000 --- a/apps/_infra/dd-relaunch-cp.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env bash -# dd-relaunch-cp.sh — destroy and recreate a local TDX CP VM. -# -# Invoked over SSH by .github/actions/relaunch-cp during the -# `target: ssh` branch of deploy-cp.yml. Mirrors dd-relaunch.sh (the -# agent-side version) — pulls the PR's apps/_infra tree, tears down -# the existing dd-local-{env}-cp VM, runs local-cp.sh to redefine, -# and starts it. 
-# -# dd-relaunch-cp.sh [ref] [release-tag] -# -# Required env (SSH'd-in from CI secrets): -# CLOUDFLARE_API_TOKEN, CLOUDFLARE_ACCOUNT_ID, CLOUDFLARE_ZONE_ID -# DD_ACCESS_ADMIN_EMAIL -# DD_ITA_API_KEY -# -# DD_RELEASE_TAG (optional) — passed positionally as $4. - -set -euo pipefail - -ENV_LABEL="${1?usage: dd-relaunch-cp.sh [ref] [release-tag]}" -HOSTNAME="${2?hostname required}" -REF="${3:-main}" -export DD_RELEASE_TAG="${4:-${DD_RELEASE_TAG:-latest}}" - -: "${CLOUDFLARE_API_TOKEN?}" -: "${CLOUDFLARE_ACCOUNT_ID?}" -: "${CLOUDFLARE_ZONE_ID?}" -: "${DD_ACCESS_ADMIN_EMAIL?}" -: "${DD_ITA_API_KEY?}" - -cd /home/tdx2/src/dd - -# Refresh apps/ from the caller's ref. Limited checkout so unrelated -# dirty state doesn't block the deploy. Matches the agent path. -git fetch --quiet origin "$REF" -git checkout --quiet "origin/$REF" -- apps/ -echo "dd-relaunch-cp: refreshed apps/ from origin/$REF" - -# Sync the libvirt base qcow2 from the easyenclave release channel -# for this env. `production` tracks `stable` (v*); anything else -# (pr-N, dev) tracks `staging`. `DD_EE_TAG` overrides the channel -# default for pre-flight-testing a candidate release. -# shellcheck source=./ee-sync.sh -. ./apps/_infra/ee-sync.sh -case "$ENV_LABEL" in - production) export DD_EE_CHANNEL="${DD_EE_CHANNEL:-stable}" ;; - *) export DD_EE_CHANNEL="${DD_EE_CHANNEL:-staging}" ;; -esac -sync_base /var/lib/libvirt/images/easyenclave-local.qcow2 - -VM="dd-local-$ENV_LABEL-cp" - -./apps/_infra/local-cp.sh "$ENV_LABEL" "$HOSTNAME" -virsh start "$VM" -echo "relaunched $VM against https://$HOSTNAME" diff --git a/apps/_infra/dd-relaunch.sh b/apps/_infra/dd-relaunch.sh deleted file mode 100755 index 17817af..0000000 --- a/apps/_infra/dd-relaunch.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env bash -# dd-relaunch.sh — destroy and recreate one local TDX agent VM. -# -# Invoked over SSH by .github/actions/relaunch-agent during a Release -# cascade. 
Pulls the PR's (or main's) apps/_infra tree so this script -# and local-agents.sh are always the ones the caller authored. Tears -# down the existing VM + overlay, runs local-agents.sh to redefine, -# and starts the VM. -# -# dd-relaunch.sh prod https://app.devopsdefender.com main -# dd-relaunch.sh preview https://pr-N.devopsdefender.com feat/some-pr -# -# DD_ITA_API_KEY must be set in the environment. No GitHub PAT — the -# agent authenticates to the CP via ITA attestation at /register and -# uses a CF Access service token (received in the register response) -# for subsequent machine-to-machine calls. - -set -euo pipefail - -KIND="${1?usage: dd-relaunch.sh [ref] [release-tag]}" -CP="${2?cp url required}" -REF="${3:-main}" -# DD_RELEASE_TAG optional — defaults to "latest" (prod cascade) but CI -# passes the PR-specific tag for preview so the agent binary matches -# the CP binary (auth protocol, bootstrap shape, etc.). -export DD_RELEASE_TAG="${4:-${DD_RELEASE_TAG:-latest}}" -: "${DD_ITA_API_KEY?DD_ITA_API_KEY must be set}" - -case "$KIND" in - prod|preview) ;; - *) echo "unknown kind: $KIND (want prod|preview)" >&2; exit 2 ;; -esac - -cd /home/tdx2/src/dd - -# Refresh the infra scripts + apps/ tree from the caller's ref. Limited -# checkout so a dirty working tree elsewhere doesn't block the deploy. -# This script is already in memory, so the refresh takes effect on the -# *next* invocation. -git fetch --quiet origin "$REF" -git checkout --quiet "origin/$REF" -- apps/ -echo "dd-relaunch: refreshed apps/ from origin/$REF" - -# Keep the libvirt base qcow2 aligned with the easyenclave release -# channel for this env. `prod` tracks `stable` (v*); `preview` tracks -# `staging` (main-branch prereleases). `DD_EE_TAG` from CI pins a -# specific release for pre-flight testing a candidate. Running VMs -# hold the old inode via open fds, so `mv` in place is safe. -# shellcheck source=./ee-sync.sh -. 
./apps/_infra/ee-sync.sh -case "$KIND" in - prod) export DD_EE_CHANNEL="${DD_EE_CHANNEL:-stable}" ;; - preview) export DD_EE_CHANNEL="${DD_EE_CHANNEL:-staging}" ;; -esac -sync_base /var/lib/libvirt/images/easyenclave-local.qcow2 - -vm="dd-local-$KIND" -overlay="/var/lib/libvirt/images/$vm.qcow2" - -virsh destroy "$vm" 2>/dev/null || true -virsh undefine "$vm" --managed-save --snapshots-metadata 2>/dev/null || true -rm -f "$overlay" - -# Redefine via local-agents.sh; "" skips the other slot. -case "$KIND" in - prod) ./apps/_infra/local-agents.sh "" "$CP" ;; - preview) ./apps/_infra/local-agents.sh "$CP" "" ;; -esac - -virsh start "$vm" -echo "relaunched $vm against $CP" diff --git a/apps/_infra/ee-sync.sh b/apps/_infra/ee-sync.sh deleted file mode 100755 index 597bad5..0000000 --- a/apps/_infra/ee-sync.sh +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env bash -# ee-sync.sh — keep `/var/lib/libvirt/images/easyenclave-local.qcow2` on -# the tdx2 host in sync with the right easyenclave release channel. -# -# Sourced by dd-relaunch.sh + dd-relaunch-cp.sh. Each relaunch: -# 1. Resolves the desired EE tag — explicit `DD_EE_TAG` wins, else -# the latest release matching `DD_EE_CHANNEL` (stable | staging). -# 2. Compares against the sidecar `.tag` file next to the qcow2. -# 3. Downloads + atomic-renames into place if different. -# -# Both prod and preview pass an explicit `DD_EE_TAG` pin from -# release.yml's deploy-production / deploy-preview `with:` blocks. -# Neither env tracks a channel dynamically — the pin only moves when -# release.yml is updated in a PR. `DD_EE_TAG` always wins over -# `DD_EE_CHANNEL` below; the channel-resolver fallback is kept only -# for the workflow_dispatch-with-blank-ee_tag rollback escape-hatch -# and for the (currently unused) GCP image-family path. -# -# The sidecar tag file (`.tag`) is the only persistent state -# besides the qcow2 itself. 
If it drifts (operator manually SCP'd a -# qcow2 without updating the tag), the next sync that hits a channel -# default will pull the channel's latest and overwrite — cheaper than -# hash-compare over a 300 MB file. - -# Intentionally no `set -e` here; callers already run under `set -euo`. -# We return a non-zero code from sync_base on hard failure so the -# caller's pipefail kills the relaunch before anything destructive. - -EE_REPO="${EE_REPO:-easyenclave/easyenclave}" -EE_ASSET_PATTERN="${EE_ASSET_PATTERN:-easyenclave-*-local-tdx-qcow2.qcow2}" - -sync_base() { - local base="${1:?usage: sync_base }" - local channel="${DD_EE_CHANNEL:-staging}" - local target="${DD_EE_TAG:-}" - - # Resolve target tag from channel if no explicit pin. - if [ -z "$target" ]; then - case "$channel" in - stable) - # Stable = latest non-prerelease. `--exclude-pre-releases` is a - # `gh release list` flag. - target=$(gh release list --repo "$EE_REPO" \ - --exclude-pre-releases --limit 1 \ - --json tagName -q '.[0].tagName' 2>/dev/null) - ;; - staging) - # Staging = newest prerelease. NOT `--limit 1` unfiltered: the - # day easyenclave cuts a `v*` stable tag, that release gets a - # later `createdAt` than every existing prerelease, and an - # unfiltered newest-first query would collapse staging onto - # stable — defeating the whole channel-split. Explicitly keep - # only `isPrerelease: true` entries. The first `image-*` tag - # cut on main before the v* release always wins. 
- target=$(gh release list --repo "$EE_REPO" --limit 20 \ - --json tagName,isPrerelease \ - -q '[.[] | select(.isPrerelease)][0].tagName' 2>/dev/null) - ;; - *) - echo "ee-sync: unknown DD_EE_CHANNEL=$channel (want stable|staging)" >&2 - return 2 - ;; - esac - if [ -z "$target" ]; then - echo "ee-sync: failed to resolve $channel tag from $EE_REPO (gh auth?)" >&2 - return 2 - fi - fi - - local current="" - [ -f "$base.tag" ] && current=$(cat "$base.tag") - - if [ "$target" = "$current" ] && [ -f "$base" ]; then - echo "ee-sync: $base @ $current (channel=$channel, up to date)" - return 0 - fi - - local tmp="$base.tmp.$$" - trap 'rm -f "$tmp"' RETURN - # Download is best-effort: if the candidate release doesn't yet carry - # a matching asset (e.g. a pre-merge-of-easyenclave#87 release, or a - # partial release-staging failure), keep the existing base rather - # than aborting the deploy. The tag sidecar is NOT updated in that - # case, so the next run retries. An existing base is always - # preferable to no base — worst-case we just keep running slightly - # stale EE until the asset reappears. - if ! gh release download "$target" --repo "$EE_REPO" \ - --pattern "$EE_ASSET_PATTERN" --output "$tmp" 2>&1; then - if [ -f "$base" ]; then - echo "ee-sync: $target has no '$EE_ASSET_PATTERN' asset yet; keeping existing $base (tag=$current)" >&2 - return 0 - fi - echo "ee-sync: download failed for $target (pattern=$EE_ASSET_PATTERN), no existing $base to fall back to" >&2 - return 3 - fi - - # Ensure libvirt can read it — qemu runs as libvirt-qemu:kvm. 
- chown libvirt-qemu:kvm "$tmp" 2>/dev/null || true - mv "$tmp" "$base" - echo "$target" > "$base.tag" - echo "ee-sync: $base ${current:-} -> $target (channel=$channel)" -} diff --git a/apps/_infra/local-agents.sh b/apps/_infra/local-agents.sh deleted file mode 100755 index beea88c..0000000 --- a/apps/_infra/local-agents.sh +++ /dev/null @@ -1,421 +0,0 @@ -#!/usr/bin/env bash -# local-agents.sh — define local TDX agent VMs on this host: -# -# dd-local-preview : no GPU, registers with the PR-preview CP. Bare -# agent + podman — no demo workload — so the release -# pipeline can prove registration + tunnel end-to-end -# against per-PR CPs without needing GPU hardware. -# dd-local-prod : H100 passthrough, registers with production. The -# web-nvidia-smi demo is NOT a boot workload — it's -# deployed post-registration by a Release workflow -# step using GitHub Actions OIDC against the agent's -# /deploy endpoint. Boot stays fast and minimal. -# dd-local-bot : no GPU, registers with production. Dedicated host -# for the Sats for Compute bot (or any always-on -# operator workload). Started/stopped manually — -# CI doesn't reprovision; deploy-bot.yml in the -# satsforcompute repo just dd-deploys the bot -# workload onto this agent's /deploy. Same boot -# chain as preview (cloudflared + dd-agent + ttyd -# + podman). Modest sizing. -# -# All three reuse the existing easyenclave base qcow2 via copy-on-write -# overlays; each gets its own config.iso baking in DD_CP_URL + -# DD_ITA_API_KEY for that target. No GitHub PAT — the agent -# authenticates to the CP via ITA attestation at /register and picks -# up a CF Access service token from the register response for all -# subsequent machine-to-machine calls. Libvirt XML is rendered from -# the existing `easyenclave-local` domain (strip hostdev for preview/bot). -# -# `EE_OWNER` (required) is the principal authorized to deploy to the -# baked agents — one of: -# a GitHub user OR org login (no '/'). 
Resolved via -# `gh api users/` to a numeric id and a -# user-vs-org kind. -# / a specific repository. Resolved via -# `gh api repos//` to a numeric id. -# Strictly tighter than the bare-login form. -# Both DD_OWNER_ID and DD_OWNER_KIND are derived from the resolved -# answer and baked alongside DD_OWNER. There is no default — pick a -# principal explicitly. CF Access dashboard membership only works -# for kind=org; user/repo fall back to admin-email-only. -# -# Usage: -# export DD_ITA_API_KEY="$(cat ~/.secrets/ita_api_key)" -# export EE_OWNER="devopsdefender" # or "alice", "alice/dd-foo", etc. -# ./apps/_infra/local-agents.sh -# -# Each URL arg is independent — pass "" to skip provisioning that VM: -# ./apps/_infra/local-agents.sh "" https://app.devopsdefender.com "" # prod only -# ./apps/_infra/local-agents.sh https://pr-N.devopsdefender.com "" "" # preview only -# ./apps/_infra/local-agents.sh "" "" https://app.devopsdefender.com # bot only -# ./apps/_infra/local-agents.sh "" https://app.devopsdefender.com https://app.devopsdefender.com # prod + bot -# -# After: virsh start dd-local-preview && virsh start dd-local-prod && virsh start dd-local-bot - -set -euo pipefail - -PREVIEW_CP="${1-}" -PROD_CP="${2-}" -BOT_CP="${3-}" -if [ -z "$PREVIEW_CP" ] && [ -z "$PROD_CP" ] && [ -z "$BOT_CP" ]; then - echo "usage: $0 " >&2 - exit 1 -fi -: "${DD_ITA_API_KEY?set DD_ITA_API_KEY}" -: "${EE_OWNER?set EE_OWNER (GitHub login or owner/repo path; no default)}" -# DD_RELEASE_TAG pins which devopsdefender binary the agent downloads. -# Defaults to "latest" for ad-hoc runs; the relaunch-agent action sets -# it to the PR's release tag so preview deploys test the PR binary. -DD_RELEASE_TAG="${DD_RELEASE_TAG:-latest}" - -# Resolve EE_OWNER to (id, kind) once via `gh api`. Hard-fails if the -# login or repo doesn't exist — better than baking a typo into a -# config.iso whose agent then 401s every deploy with no signal. 
-# Run once at script load so all three VMs (preview/prod/bot) share -# the same owner principal. -command -v gh >/dev/null || { echo "gh CLI required to resolve EE_OWNER" >&2; exit 1; } -if [[ "$EE_OWNER" == */* ]]; then - EE_OWNER_ID=$(gh api "repos/$EE_OWNER" -q .id) || { - echo "EE_OWNER='$EE_OWNER' (looks like a repo, contains '/') did not resolve via gh api" >&2 - exit 1 - } - EE_OWNER_KIND=repo -else - read -r EE_OWNER_ID _gh_type < <(gh api "users/$EE_OWNER" -q '"\(.id) \(.type)"') || { - echo "EE_OWNER='$EE_OWNER' did not resolve via gh api users/" >&2 - exit 1 - } - case "$_gh_type" in - User) EE_OWNER_KIND=user ;; - Organization) EE_OWNER_KIND=org ;; - *) echo "unexpected gh api type: $_gh_type" >&2; exit 1 ;; - esac -fi -unset _gh_type -echo " EE_OWNER=$EE_OWNER (kind=$EE_OWNER_KIND, id=$EE_OWNER_ID)" - -# Resolve repo root regardless of invoking CWD — the workload specs -# under apps// need absolute paths so bake() can find them. -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" - -IMG_DIR=/var/lib/libvirt/images -BASE="$IMG_DIR/easyenclave-local.qcow2" -BASE_DOMAIN="easyenclave-local" - -# Render one workload spec. Matches the helper inlined in -# .github/workflows/deploy-cp.yml — same envsubst + empty-entry strip, -# so boot-time (config.iso) and runtime (/deploy) see identical JSON. -# -# envsubst is restricted to the ALL-CAPS `${VAR}` references that -# appear in the template itself. Lowercase `$i`, `${i}`, and bare -# `$((…))` arithmetic inside shell cmd strings are left alone. -bake() { - case "$1" in - *.json.tmpl) - local vars - vars=$(grep -oE '\$\{[A-Z_][A-Z0-9_]*\}' "$1" | sort -u | tr -d '\n') - envsubst "$vars" < "$1" \ - | jq -c 'if .env then .env |= map(select(test("^[^=]+=.+"))) else . end' - ;; - *.json) - jq -c . 
"$1" - ;; - *) - echo "local-agents.sh: unknown workload file type: $1" >&2 - return 1 - ;; - esac -} - -# Extract `expose` entries from a stream of baked workloads and emit -# them as a comma-separated `label:port` string — the shape dd-agent -# expects in $DD_EXTRA_INGRESS. Using plain text (not JSON) avoids -# quote-escaping when the value gets substituted into the dd-agent -# workload template's `"DD_EXTRA_INGRESS=${DD_EXTRA_INGRESS}"` env -# entry: embedded `"` would close the outer JSON string early and -# produce invalid JSON (jq: "Invalid numeric literal"). -extract_extra_ingress() { - jq -rs 'map(select(.expose) | "\(.expose.hostname_label):\(.expose.port)") | join(",")' -} - -[ -r "$BASE" ] || { echo "missing $BASE" >&2; exit 1; } -virsh dominfo "$BASE_DOMAIN" >/dev/null 2>&1 || { - echo "base libvirt domain '$BASE_DOMAIN' not defined — rebuild the EE image first" >&2 - exit 1 -} - -env_from_url() { - local h - h=$(echo "$1" | sed -E 's#https?://##;s#/.*##') - case "$h" in - app.*) echo production ;; - *) echo "${h%%.*}" ;; - esac -} - -build_config_iso() { - # $1=name, $2=cp_url, $3=env_label, $4=with_gpu(yes/no) - local name="$1" cp="$2" env="$3" with_gpu="$4" - local out="$IMG_DIR/dd-local-$name-config.iso" - local tmp - tmp=$(mktemp -d) - trap "rm -rf $tmp" RETURN - - # Boot workload chain (EE spawns concurrently; dependents self-sequence - # via `until` loops): - # nv — insmod nvidia driver (prod only, first so device - # nodes exist by the time web-nvidia-smi runs) - # podman-static — fetch the podman tarball into /var/lib/easyenclave/bin - # podman-bootstrap — stage binaries, install /var/lib/easyenclave/bin/podman - # wrapper + containers.conf + policy.json - # web-nvidia-smi — prod only. Run nvidia/cuda container, serve - # `nvidia-smi` output on :8081. - # cloudflared — fetch binary (agent spawns the tunnel process) - # dd-agent — register with CP, serve workloads. Requests the - # gpu. 
ingress via $DD_EXTRA_INGRESS, - # computed below from `expose` entries on the - # baked workloads. - # web-nvidia-smi is intentionally NOT a boot workload — it's - # deployed post-registration by the Release workflow via GH OIDC. - # Boot is: nvidia driver (GPU only), podman runtime, cloudflared. - local bare_workloads - bare_workloads=$({ - # mount-data runs first so `/dev/vdc` is at `/data` by the time - # podman-bootstrap reaches its `mountpoint -q` wait (both spawn - # concurrently but EE's pre-fetch serializes binary downloads - # before boot-loop). - bake "$REPO_ROOT/apps/mount-data/workload.json" - [ "$with_gpu" = "yes" ] && bake "$REPO_ROOT/apps/nv/workload.json" - bake "$REPO_ROOT/apps/podman-static/workload.json" - bake "$REPO_ROOT/apps/podman-bootstrap/workload.json" - bake "$REPO_ROOT/apps/cloudflared/workload.json" - bake "$REPO_ROOT/apps/ttyd/workload.json" - }) - - local extra_ingress - extra_ingress=$(echo "$bare_workloads" | extract_extra_ingress) - - local workloads - workloads=$({ - echo "$bare_workloads" - DD_CP_URL="$cp" \ - DD_ITA_API_KEY="$DD_ITA_API_KEY" \ - DD_ENV="$env" \ - DD_VM_NAME="dd-local-$name" \ - DD_EXTRA_INGRESS="$extra_ingress" \ - DD_RELEASE_TAG="$DD_RELEASE_TAG" \ - DD_OWNER="$EE_OWNER" \ - DD_OWNER_ID="$EE_OWNER_ID" \ - DD_OWNER_KIND="$EE_OWNER_KIND" \ - bake "$REPO_ROOT/apps/dd-agent/workload.json.tmpl" - } | jq -cs '.') - - { - echo "EE_OWNER=$EE_OWNER" - echo "EE_OWNER_ID=$EE_OWNER_ID" - echo "EE_OWNER_KIND=$EE_OWNER_KIND" - echo "EE_BOOT_WORKLOADS=$workloads" - # EE capture-socket tee target. Kept for forward compatibility: a - # future workload (e.g. an attested proxy) can bind + listen on it. - # Unpatched EE images ignore the variable; patched EE falls back to - # running without capture when nothing is listening, so the - # boot-of-the-listener ≠ boot-of-the-writer race is non-fatal. - echo "EE_CAPTURE_SOCKET=/run/ee/capture.sock" - } > "$tmp/agent.env" - - # ext4 — EE rootfs has no iso9660 module. 
- truncate -s 4M "$out" - # `-O ^has_journal` — 4 MB is below ext4's journal min (~8 MB), - # silences "Filesystem too small for a journal". Config volume is - # read-only so journaling isn't needed anyway. - mkfs.ext4 -q -O ^has_journal -d "$tmp" "$out" - echo " wrote $out (env=$env, gpu=$with_gpu, extra_ingress=$extra_ingress)" -} - -build_overlay() { - # $1=name - # - # Just the root overlay — small, sparse, tracks EE boot state. - # Real workload storage (podman images, HF model weights) lives on - # a SEPARATE workload.qcow2 mounted at /dev/vdc inside the VM and - # sized per the DD capacity rule — see `build_workload_disk`. - local name="$1" - local overlay="$IMG_DIR/dd-local-$name.qcow2" - if [ -f "$overlay" ]; then - echo " overlay $overlay already exists (reusing)" - return - fi - qemu-img create -q -F qcow2 -b "$BASE" -f qcow2 "$overlay" 20G - echo " wrote $overlay (20G sparse, backing $BASE)" -} - -build_workload_disk() { - # $1=name $2=size-spec (e.g. 160G, 1920G) - # - # Persistent podman/model storage as a separate qcow2, ext4-formatted - # so EE's `mount-data` workload can mount it at `/data` (where - # podman-bootstrap looks for overlay driver backing). - # Sparse — the 1.92 TB GPU disk occupies <1 MB until something - # actually writes. Uses qemu-nbd + mkfs.ext4 for one-time format. - local name="$1" size="${2:-160G}" - local disk="$IMG_DIR/dd-local-$name-workload.qcow2" - if [ -f "$disk" ]; then - echo " workload disk $disk already exists (reusing)" - return - fi - qemu-img create -q -f qcow2 "$disk" "$size" - # Load nbd + pick first free /dev/nbdN. Idempotent. - sudo modprobe nbd max_part=8 2>/dev/null || true - local nbd - for n in /dev/nbd*; do - [ -b "$n" ] || continue - [ -s "/sys/block/$(basename "$n")/pid" ] && continue - nbd="$n" - break - done - [ -n "$nbd" ] || { echo "no free /dev/nbd*"; exit 1; } - sudo qemu-nbd --connect="$nbd" "$disk" - # Retry — qemu-nbd returns before the device is fully ready for IO. 
- for _ in 1 2 3 4 5; do - if sudo mkfs.ext4 -q -L workload "$nbd" 2>/dev/null; then - break - fi - sleep 1 - done - sudo qemu-nbd --disconnect "$nbd" >/dev/null - echo " wrote $disk ($size ext4, label=workload)" -} - -render_domain_xml() { - # $1=name, $2=with_gpu (yes/no) - local name="$1" with_gpu="$2" - local out="/tmp/dd-local-$name.xml" - - virsh dumpxml "$BASE_DOMAIN" > "$out" - - # Rename domain, strip UUID (libvirt regens), strip MAC (libvirt regens). - sed -i "s|$BASE_DOMAIN|dd-local-$name|" "$out" - sed -i '//d' "$out" - sed -i '/\n" - f" \n" - f" \n" - f" \n" - f" \n" -) -# Append after the last existing .. block so bus/slot -# assignment stays libvirt's job (no
specified). -x = re.sub(r"(\n)(?=(?:(?!).)*?)", r"\1" + new_disk, x, count=1, flags=re.DOTALL) -with open(xml_path, "w") as f: f.write(x) -PY - # Rewrite the serial/console log file — base XML points at - # /var/log/ee-local.log, which libvirt opens exclusively. Two VMs - # sharing the same path collide with "Device or resource busy". - sed -i "s|/var/log/ee-local\\.log|/var/log/ee-local-$name.log|g" "$out" - - # Size the VM per the DD capacity rule. Host has 243 GiB / 64 vCPU - # so even the GPU-prod shape leaves room for a CP VM + several - # preview agents concurrently. - # - # preview (no GPU): RAM = 16 GiB, vCPU = 4 - # prod (H100): RAM = 32 GiB, vCPU = 16 - local mem_kib vcpus - if [ "$with_gpu" = "yes" ]; then - # Capped at 32 GiB to stay under the VFIO RAM-discard listener - # limit (16M mappings; TDX guest_memfd discards at 4 KiB, so 64 GiB - # is the hard ceiling and 32 GiB leaves headroom for device - # regions). Raise via 1 GiB hugepages on the host before bumping. - mem_kib=33554432 # 32 GiB - vcpus=16 - else - mem_kib=16777216 # 16 GiB - vcpus=4 - fi - sed -i -E "s|[0-9]+|$mem_kib|" "$out" - sed -i -E "s|[0-9]+|$mem_kib|" "$out" - sed -i -E "s|[0-9]+|$vcpus|" "$out" - - # Wire QEMU's tdx-guest to the host's QGS unix socket so the guest's - # TDVMCALL for a quote actually reaches Intel's quote-generation - # service. Without this, configfs-tsm `outblob` returns 0 bytes → - # ITA mint POSTs an empty quote → Intel rejects → agent fails to - # register. Idempotent: skips if the launchSecurity element is - # already expanded. - if grep -q "" "$out"; then - sed -i "s|||" "$out" - fi - - if [ "$with_gpu" != "yes" ]; then - # Strip the block for the preview VM. 
- awk 'BEGIN{skip=0} - //{skip=0}' "$out" > "$out.tmp" && mv "$out.tmp" "$out" - fi - - echo "$out" -} - -define_agent() { - # $1=name, $2=cp_url, $3=with_gpu - local name="$1" cp="$2" with_gpu="$3" - local env_label - env_label=$(env_from_url "$cp") - - echo "== dd-local-$name → $cp (env=$env_label, gpu=$with_gpu) ==" - build_overlay "$name" - # Workload disk (/dev/vdc, ext4, mounted at /data by the mount-data - # boot workload). Sized per the DD capacity rule: - # prod (H100): 10 × (RAM+VRAM) = 10 × 192 = 1.92 TB - # preview: 10 × RAM = 10 × 16 = 160 GB - # Sparse qcow2, so only grows with actual writes. - if [ "$with_gpu" = "yes" ]; then - build_workload_disk "$name" 1920G - else - build_workload_disk "$name" 160G - fi - build_config_iso "$name" "$cp" "$env_label" "$with_gpu" - local xml - xml=$(render_domain_xml "$name" "$with_gpu") - virsh destroy "dd-local-$name" 2>/dev/null || true - virsh undefine "dd-local-$name" --managed-save --snapshots-metadata 2>/dev/null || true - virsh define "$xml" >/dev/null - echo " defined dd-local-$name (xml at $xml)" -} - -[ -n "$PREVIEW_CP" ] && define_agent preview "$PREVIEW_CP" no -[ -n "$PROD_CP" ] && define_agent prod "$PROD_CP" yes -[ -n "$BOT_CP" ] && define_agent bot "$BOT_CP" no - -echo -echo "done. start with:" -[ -n "$PREVIEW_CP" ] && echo " virsh start dd-local-preview" -[ -n "$PROD_CP" ] && echo " virsh start dd-local-prod" -[ -n "$BOT_CP" ] && echo " virsh start dd-local-bot" -echo -echo "watch registration (Ctrl-] to exit):" -[ -n "$PREVIEW_CP" ] && echo " virsh console dd-local-preview" -[ -n "$PROD_CP" ] && echo " virsh console dd-local-prod" -[ -n "$BOT_CP" ] && echo " virsh console dd-local-bot" - -# Explicit 0 — the tail `[ -n "$BOT_CP" ] && …` returns 1 when -# BOT_CP="" (preview/prod-only), bubbling up as the script exit -# status and tripping set -e in dd-relaunch.sh. Force success. 
-exit 0 diff --git a/apps/_infra/local-cp.sh b/apps/_infra/local-cp.sh deleted file mode 100755 index 6800e7b..0000000 --- a/apps/_infra/local-cp.sh +++ /dev/null @@ -1,214 +0,0 @@ -#!/usr/bin/env bash -# local-cp.sh — define a local TDX control-plane VM on this host. -# -# Mirrors local-agents.sh but boots dd-management (the CP) instead of -# dd-agent. Used by deploy-cp.yml's `target: ssh` branch — the SSH -# equivalent of spinning up a GCE TDX instance. Provides the same -# `dd-{env}-cp-*` CF tunnel shape prod is used to, so /api/agents, -# /register, /health, etc. work identically to the GCE path. -# -# Usage: -# apps/_infra/local-cp.sh -# env pr-N | staging | local-dev — same as DD_ENV -# hostname where the CP registers its CF tunnel -# (e.g. pr-42.devopsdefender.com) -# -# Required env (all from the calling workflow's secrets): -# CLOUDFLARE_API_TOKEN, CLOUDFLARE_ACCOUNT_ID, CLOUDFLARE_ZONE_ID -# DD_ACCESS_ADMIN_EMAIL, DD_ITA_API_KEY -# EE_OWNER GitHub login or owner/repo path (no default). -# Resolved at runtime via `gh api` to (id, kind); -# DD_OWNER_ID + DD_OWNER_KIND are derived from it. -# DD_RELEASE_TAG (defaults to "latest") -# -# Sizing: 16 GiB RAM / 4 vCPU / 160 GB qcow2 overlay — general shape -# from the DD capacity rule. CP doesn't need GPU; GPU stays on the -# prod agent VM (H100 passthrough). - -set -euo pipefail - -ENV_LABEL="${1?usage: $0 }" -HOSTNAME="${2?hostname required}" -: "${CLOUDFLARE_API_TOKEN?}" -: "${CLOUDFLARE_ACCOUNT_ID?}" -: "${CLOUDFLARE_ZONE_ID?}" -: "${DD_ACCESS_ADMIN_EMAIL?}" -: "${DD_ITA_API_KEY?}" -: "${EE_OWNER?set EE_OWNER (GitHub login or owner/repo path; no default)}" -DD_RELEASE_TAG="${DD_RELEASE_TAG:-latest}" - -# Resolve EE_OWNER to (id, kind) via gh api — same idiom as -# local-agents.sh. 
-command -v gh >/dev/null || { echo "gh CLI required to resolve EE_OWNER" >&2; exit 1; } -if [[ "$EE_OWNER" == */* ]]; then - EE_OWNER_ID=$(gh api "repos/$EE_OWNER" -q .id) || { - echo "EE_OWNER='$EE_OWNER' did not resolve via gh api repos/" >&2 - exit 1 - } - EE_OWNER_KIND=repo -else - read -r EE_OWNER_ID _gh_type < <(gh api "users/$EE_OWNER" -q '"\(.id) \(.type)"') || { - echo "EE_OWNER='$EE_OWNER' did not resolve via gh api users/" >&2 - exit 1 - } - case "$_gh_type" in - User) EE_OWNER_KIND=user ;; - Organization) EE_OWNER_KIND=org ;; - *) echo "unexpected gh api type: $_gh_type" >&2; exit 1 ;; - esac -fi -unset _gh_type -echo " EE_OWNER=$EE_OWNER (kind=$EE_OWNER_KIND, id=$EE_OWNER_ID)" -DD_DOMAIN="${DD_DOMAIN:-devopsdefender.com}" -DD_ITA_BASE_URL="${DD_ITA_BASE_URL:-https://api.trustauthority.intel.com}" -DD_ITA_JWKS_URL="${DD_ITA_JWKS_URL:-https://portal.trustauthority.intel.com/certs}" -DD_ITA_ISSUER="${DD_ITA_ISSUER:-https://portal.trustauthority.intel.com}" - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" - -IMG_DIR=/var/lib/libvirt/images -BASE="$IMG_DIR/easyenclave-local.qcow2" -BASE_DOMAIN="easyenclave-local" - -NAME="$ENV_LABEL-cp" -VM="dd-local-$NAME" - -[ -r "$BASE" ] || { echo "missing $BASE" >&2; exit 1; } -virsh dominfo "$BASE_DOMAIN" >/dev/null 2>&1 || { - echo "base libvirt domain '$BASE_DOMAIN' not defined — rebuild the EE image first" >&2 - exit 1 -} - -# Same bake helper as local-agents.sh — envsubst restricted to -# `${VAR}` refs the template declares, empty env entries stripped. -bake() { - case "$1" in - *.json.tmpl) - local vars - vars=$(grep -oE '\$\{[A-Z_][A-Z0-9_]*\}' "$1" | sort -u | tr -d '\n') - envsubst "$vars" < "$1" \ - | jq -c 'if .env then .env |= map(select(test("^[^=]+=.+"))) else . end' - ;; - *.json) - jq -c . 
"$1" - ;; - *) - echo "local-cp.sh: unknown workload file type: $1" >&2 - return 1 - ;; - esac -} - -build_config_iso() { - local out="$IMG_DIR/$VM-config.iso" - local tmp - tmp=$(mktemp -d) - trap "rm -rf $tmp" RETURN - - local workloads - workloads=$({ - bake "$REPO_ROOT/apps/cloudflared/workload.json" - DD_RELEASE_TAG="$DD_RELEASE_TAG" \ - CLOUDFLARE_API_TOKEN="$CLOUDFLARE_API_TOKEN" \ - CLOUDFLARE_ACCOUNT_ID="$CLOUDFLARE_ACCOUNT_ID" \ - CLOUDFLARE_ZONE_ID="$CLOUDFLARE_ZONE_ID" \ - DD_DOMAIN="$DD_DOMAIN" \ - DD_HOSTNAME="$HOSTNAME" \ - DD_ENV="$ENV_LABEL" \ - DD_ACCESS_ADMIN_EMAIL="$DD_ACCESS_ADMIN_EMAIL" \ - DD_ITA_API_KEY="$DD_ITA_API_KEY" \ - DD_ITA_BASE_URL="$DD_ITA_BASE_URL" \ - DD_ITA_JWKS_URL="$DD_ITA_JWKS_URL" \ - DD_ITA_ISSUER="$DD_ITA_ISSUER" \ - DD_OWNER="$EE_OWNER" \ - DD_OWNER_ID="$EE_OWNER_ID" \ - DD_OWNER_KIND="$EE_OWNER_KIND" \ - bake "$REPO_ROOT/apps/dd-management/workload.json.tmpl" - bake "$REPO_ROOT/apps/ttyd/workload.json" - } | jq -cs '.') - - { - echo "EE_OWNER=$EE_OWNER" - echo "EE_OWNER_ID=$EE_OWNER_ID" - echo "EE_OWNER_KIND=$EE_OWNER_KIND" - echo "EE_BOOT_WORKLOADS=$workloads" - echo "EE_CAPTURE_SOCKET=/run/ee/capture.sock" - } > "$tmp/agent.env" - - truncate -s 4M "$out" - # `-O ^has_journal` — 4 MB is below the ext4 journal min (~8 MB); - # silence "Filesystem too small for a journal" and skip journaling, - # which this read-only config volume doesn't need anyway. - mkfs.ext4 -q -O ^has_journal -d "$tmp" "$out" - echo " wrote $out (env=$ENV_LABEL, hostname=$HOSTNAME)" -} - -build_overlay() { - local overlay="$IMG_DIR/$VM.qcow2" - if [ -f "$overlay" ]; then - echo " overlay $overlay already exists (reusing)" - return - fi - # 160 GB — general shape from the DD capacity rule. Sparse qcow2. 
- qemu-img create -q -F qcow2 -b "$BASE" -f qcow2 "$overlay" 160G - echo " wrote $overlay (160G, backing $BASE)" -} - -render_domain_xml() { - local out="/tmp/$VM.xml" - virsh dumpxml "$BASE_DOMAIN" > "$out" - - sed -i "s|$BASE_DOMAIN|$VM|" "$out" - sed -i '//d' "$out" - sed -i '/[0-9]+|$mem_kib|" "$out" - sed -i -E "s|[0-9]+|$mem_kib|" "$out" - sed -i -E "s|[0-9]+|$vcpus|" "$out" - - # Strip GPU passthrough — CP doesn't need it and having two domains - # claim the same host device collides. - # Remove any blocks (vfio-pci H100). - python3 - "$out" <<'PY' -import re, sys -p = sys.argv[1] -with open(p) as f: x = f.read() -x = re.sub(r"\s*]*>.*?\n?", "", x, flags=re.DOTALL) -with open(p, "w") as f: f.write(x) -PY - - # Wire QEMU's tdx-guest to host's QGS unix socket — same treatment - # local-agents.sh does so ITA quotes work inside the CP VM. Must use - # libvirt's schema-valid form: `` - # (camelCase, path attribute). The earlier - # `vsock:2:4050` - # form is not in libvirt's RNG — `virsh define` accepts it but - # canonicalizes it away, leaving `` with - # no QGS wired → guest can't produce a quote → dd-management's ITA - # mint fails with "Quote cannot be empty" → CP poweroffs. - if grep -q "" "$out"; then - sed -i "s|||" "$out" - fi - - cat "$out" -} - -echo "== $VM → https://$HOSTNAME (env=$ENV_LABEL) ==" -build_overlay -build_config_iso -xml=$(render_domain_xml) -# Destroy any previous instance. rm on /var/log needs sudo (root-owned -# by libvirt); || true so a missing file or permission denial doesn't -# fail the deploy — libvirt will overwrite on domain start anyway. 
-virsh destroy "$VM" 2>/dev/null || true -virsh undefine "$VM" --managed-save --snapshots-metadata 2>/dev/null || true -sudo rm -f "/var/log/ee-local-$NAME.log" 2>/dev/null || true -echo "$xml" | virsh define /dev/stdin >/dev/null -echo " defined $VM" diff --git a/apps/cloudflared/workload.json b/apps/cloudflared/workload.json deleted file mode 100644 index 1b2270a..0000000 --- a/apps/cloudflared/workload.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "app_name": "cloudflared", - "github_release": { - "repo": "cloudflare/cloudflared", - "asset": "cloudflared-linux-amd64", - "rename": "cloudflared" - } -} diff --git a/apps/dd-agent/workload.json.tmpl b/apps/dd-agent/workload.json.tmpl deleted file mode 100644 index 5659527..0000000 --- a/apps/dd-agent/workload.json.tmpl +++ /dev/null @@ -1,25 +0,0 @@ -{ - "app_name": "dd-agent", - "github_release": { - "repo": "devopsdefender/dd", - "asset": "devopsdefender", - "tag": "${DD_RELEASE_TAG}" - }, - "inherit_token": true, - "cmd": ["devopsdefender", "agent"], - "env": [ - "DD_MODE=agent", - "DD_CP_URL=${DD_CP_URL}", - "DD_ITA_API_KEY=${DD_ITA_API_KEY}", - "DD_ITA_BASE_URL=https://api.trustauthority.intel.com", - "DD_ITA_JWKS_URL=https://portal.trustauthority.intel.com/certs", - "DD_ITA_ISSUER=https://portal.trustauthority.intel.com", - "DD_OWNER=${DD_OWNER}", - "DD_OWNER_ID=${DD_OWNER_ID}", - "DD_OWNER_KIND=${DD_OWNER_KIND}", - "DD_ENV=${DD_ENV}", - "DD_VM_NAME=${DD_VM_NAME}", - "DD_PORT=8080", - "DD_EXTRA_INGRESS=${DD_EXTRA_INGRESS}" - ] -} diff --git a/apps/dd-management/workload.json.tmpl b/apps/dd-management/workload.json.tmpl deleted file mode 100644 index 8762722..0000000 --- a/apps/dd-management/workload.json.tmpl +++ /dev/null @@ -1,28 +0,0 @@ -{ - "app_name": "dd-management", - "github_release": { - "repo": "devopsdefender/dd", - "asset": "devopsdefender", - "tag": "${DD_RELEASE_TAG}" - }, - "inherit_token": true, - "cmd": ["devopsdefender"], - "env": [ - "DD_MODE=management", - 
"DD_CF_API_TOKEN=${CLOUDFLARE_API_TOKEN}", - "DD_CF_ACCOUNT_ID=${CLOUDFLARE_ACCOUNT_ID}", - "DD_CF_ZONE_ID=${CLOUDFLARE_ZONE_ID}", - "DD_CF_DOMAIN=${DD_DOMAIN}", - "DD_HOSTNAME=${DD_HOSTNAME}", - "DD_ENV=${DD_ENV}", - "DD_OWNER=${DD_OWNER}", - "DD_OWNER_ID=${DD_OWNER_ID}", - "DD_OWNER_KIND=${DD_OWNER_KIND}", - "DD_ACCESS_ADMIN_EMAIL=${DD_ACCESS_ADMIN_EMAIL}", - "DD_PORT=8080", - "DD_ITA_API_KEY=${DD_ITA_API_KEY}", - "DD_ITA_BASE_URL=${DD_ITA_BASE_URL}", - "DD_ITA_JWKS_URL=${DD_ITA_JWKS_URL}", - "DD_ITA_ISSUER=${DD_ITA_ISSUER}" - ] -} diff --git a/apps/hello-world/workload.json b/apps/hello-world/workload.json deleted file mode 100644 index a619f14..0000000 --- a/apps/hello-world/workload.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "app_name": "hello-world", - "cmd": [ - "/bin/busybox", "sh", "-c", - "until [ -x /var/lib/easyenclave/bin/podman ]; do sleep 2; done\nexec /var/lib/easyenclave/bin/podman run --rm --name hello-world docker.io/library/busybox echo 'hello-world from dd preview'" - ] -} diff --git a/apps/mount-data/workload.json b/apps/mount-data/workload.json deleted file mode 100644 index 39536d5..0000000 --- a/apps/mount-data/workload.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "app_name": "mount-data", - "cmd": [ - "/bin/busybox", "sh", "-c", - "set -e; if [ -b /dev/vdc ]; then mkdir -p /data; mount -t ext4 /dev/vdc /data && echo mount-data: mounted /dev/vdc on /data; else echo mount-data: no /dev/vdc, skipping; fi" - ] -} diff --git a/apps/nv/workload.json b/apps/nv/workload.json deleted file mode 100644 index 94223a2..0000000 --- a/apps/nv/workload.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "app_name": "nv", - "cmd": [ - "/bin/busybox", "sh", "-c", - "set -e; K=/lib/modules/7.0.0-14-generic/kernel/nvidia-580srv-open; /sbin/insmod $K/nvidia.ko NVreg_OpenRmEnableUnsupportedGpus=1 2>/dev/null || true; /sbin/insmod $K/nvidia-uvm.ko 2>/dev/null || true; [ -e /dev/nvidiactl ] || mknod -m 666 /dev/nvidiactl c 195 255; [ -e /dev/nvidia0 ] || mknod -m 666 /dev/nvidia0 c 
195 0; U=$(awk '/ nvidia-uvm$/{print $1}' /proc/devices); if [ -n \"$U\" ]; then [ -e /dev/nvidia-uvm ] || mknod -m 666 /dev/nvidia-uvm c $U 0; [ -e /dev/nvidia-uvm-tools ] || mknod -m 666 /dev/nvidia-uvm-tools c $U 1; fi; echo nv: ready; ls /dev/nvidia* 2>&1" - ] -} diff --git a/apps/podman-bootstrap/workload.json b/apps/podman-bootstrap/workload.json deleted file mode 100644 index b325cd0..0000000 --- a/apps/podman-bootstrap/workload.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "app_name": "podman-bootstrap", - "cmd": [ - "/bin/busybox", "sh", "-c", - "set -e\nBIN=/var/lib/easyenclave/bin\nSRC=$BIN/podman-linux-amd64\nuntil [ -x $SRC/usr/local/bin/podman ]; do sleep 1; done\n# Wait for mount-data to mount /dev/vdc at /data before writing —\n# otherwise writes land on tmpfs and get shadowed the moment vdc is\n# mounted. On VMs without vdc (e.g. CP previews with no workload\n# disk) this check short-circuits.\nif [ -b /dev/vdc ]; then\n until mountpoint -q /data 2>/dev/null; do sleep 1; done\nfi\nmkdir -p /data\n# Stage helpers first (conmon, netavark, crun, etc.).\nfor f in $SRC/usr/local/bin/*; do\n name=$(basename $f)\n case $name in\n podman) cp -f $f $BIN/.podman-raw ;;\n *) cp -f $f $BIN/ ;;\n esac\ndone\ncp -f $SRC/usr/local/lib/podman/conmon $BIN/\ncp -f $SRC/usr/local/lib/podman/netavark $BIN/ 2>/dev/null || true\ncp -f $SRC/usr/local/lib/podman/aardvark-dns $BIN/ 2>/dev/null || true\ncp -f $SRC/usr/local/lib/podman/rootlessport $BIN/ 2>/dev/null || true\nmkdir -p /data/.podman/storage /data/.podman/runroot\n# /dev/shm holds podman's per-container POSIX shm lock file\n# (libpod_lock). EE may not mount tmpfs there; without it, podman\n# fails `failed to create 2048 locks in /libpod_lock`. Idempotent.\nif ! 
mountpoint -q /dev/shm 2>/dev/null; then\n mkdir -p /dev/shm\n mount -t tmpfs -o size=64M tmpfs /dev/shm 2>/dev/null || true\nfi\n# Pick storage driver: overlay on vdc-backed ext4; vfs elsewhere\n# (overlay-on-tmpfs errors out).\nif mountpoint -q /data; then\n DRIVER=overlay\nelse\n DRIVER=vfs\nfi\nPOL=/data/.podman/policy.json\nprintf '%s' '{\"default\":[{\"type\":\"insecureAcceptAnything\"}]}' > $POL\n# /etc and /root are RO on EE. Build a writable fake HOME for\n# policy.json + podman's default lookups.\nHOME_DIR=/var/lib/easyenclave/.home\nmkdir -p $HOME_DIR/.config/containers $HOME_DIR/tmp\ncp -f $POL $HOME_DIR/.config/containers/policy.json\nCONF=/data/.podman/containers.conf\nprintf '%s\\n' '[engine]' 'helper_binaries_dir = [\"/var/lib/easyenclave/bin\"]' > $CONF\n# Wrapper installed as $BIN/podman so bare `podman ps` (from PATH)\n# reaches the right storage root + driver. Raw binary lives at\n# $BIN/.podman-raw. $BIN/dd-podman stays as a back-compat symlink\n# since openclaw's workload calls dd-podman by name.\nprintf '%s\\n' '#!/bin/sh' \"export HOME=$HOME_DIR\" \"export TMPDIR=$HOME_DIR/tmp\" \"export CONTAINERS_CONF=$CONF\" \"exec $BIN/.podman-raw --conmon=$BIN/conmon --runtime=$BIN/crun --storage-driver=$DRIVER --root=/data/.podman/storage --runroot=/data/.podman/runroot --cgroup-manager=cgroupfs \\\"\\$@\\\"\" > $BIN/podman\nchmod +x $BIN/podman\nln -sf podman $BIN/dd-podman\nls -la $CONF $POL $BIN/podman $BIN/dd-podman $BIN/.podman-raw 2>&1 || true\necho podman-bootstrap: ok driver=$DRIVER conf=$CONF policy=$POL" - ] -} diff --git a/apps/podman-static/workload.json b/apps/podman-static/workload.json deleted file mode 100644 index 939125d..0000000 --- a/apps/podman-static/workload.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "app_name": "podman-static", - "github_release": { - "repo": "mgoltzsche/podman-static", - "asset": "podman-linux-amd64.tar.gz" - } -} diff --git a/apps/ttyd/workload.json b/apps/ttyd/workload.json deleted file mode 100644 index 
9721d56..0000000 --- a/apps/ttyd/workload.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "app_name": "ttyd", - "github_release": { - "repo": "tsl0922/ttyd", - "asset": "ttyd.x86_64", - "rename": "ttyd" - }, - "expose": { "hostname_label": "block", "port": 7681 }, - "cmd": ["ttyd", "-W", "-p", "7681", "/bin/sh"] -} - diff --git a/apps/web-nvidia-smi/workload.json b/apps/web-nvidia-smi/workload.json deleted file mode 100644 index 96cb2d8..0000000 --- a/apps/web-nvidia-smi/workload.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "app_name": "web-nvidia-smi", - "expose": { "hostname_label": "gpu", "port": 8081 }, - "cmd": [ - "/bin/busybox", "sh", "-c", - "until [ -x /var/lib/easyenclave/bin/podman ]; do sleep 2; done\nexec /var/lib/easyenclave/bin/podman run --rm --name web-nvidia-smi --network=host --device=/dev/nvidia0 --device=/dev/nvidiactl --device=/dev/nvidia-uvm docker.io/nvidia/cuda:12.6.1-base-ubuntu22.04 sh -c 'set -e; apt-get update -qq && apt-get install -y -qq --no-install-recommends netcat-openbsd >/dev/null; while true; do (printf \"HTTP/1.0 200 OK\\r\\nContent-Type: text/plain\\r\\n\\r\\n\"; nvidia-smi) | nc -l -p 8081 -q 1; done'" - ] -} diff --git a/crates/dd-agent/Cargo.toml b/crates/dd-agent/Cargo.toml new file mode 100644 index 0000000..f60d264 --- /dev/null +++ b/crates/dd-agent/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "dd-agent" +version = "0.1.0" +edition = "2021" +license = "MIT" +repository = "https://github.com/devopsdefender/dd" + +[dependencies] +anyhow = "1" +axum = "0.8" +chrono = { version = "0.4", features = ["serde"] } +jsonwebtoken = "9" +reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +thiserror = "2" +tokio = { version = "1", features = ["io-util", "macros", "process", "rt-multi-thread", "sync", "time"] } +uuid = { version = "1", features = ["v4"] } diff --git a/crates/dd-agent/README.md b/crates/dd-agent/README.md new file mode 
100644 index 0000000..3ca4633 --- /dev/null +++ b/crates/dd-agent/README.md @@ -0,0 +1,28 @@ +# dd-agent + +Minimal DD agent for the v2 ownership model. + +This crate implements the new product primitive directly: + +- public `GET /health` proof document +- idempotent runtime assignment via `POST /owner` +- current-owner-only workload deployment via `POST /deploy` +- current-owner-only logs via `GET /logs/{app}` +- optional current-owner-only exec via `POST /exec` + +It is not the finished confidential runtime. It is the smallest executable +shape for validating ownership and external-repo deployment before porting TDX, +EasyEnclave, ingress, and CP reconciliation code. + +## Run + +```bash +DD_ASSIGNMENT_AUTHORITY_KIND=repo \ +DD_ASSIGNMENT_AUTHORITY_NAME=example/assigner \ +DD_ASSIGNMENT_AUTHORITY_ID=123456789 \ +cargo run -p dd-agent +``` + +The assignment authority is the GitHub principal allowed to call `/owner`. +The current owner set by `/owner` is the only principal allowed to call +`/deploy`, `/logs/{app}`, and `/exec`. 
diff --git a/crates/dd-agent/src/main.rs b/crates/dd-agent/src/main.rs new file mode 100644 index 0000000..170758c --- /dev/null +++ b/crates/dd-agent/src/main.rs @@ -0,0 +1,793 @@ +use std::collections::HashMap; +use std::process::Stdio; +use std::sync::Arc; +use std::time::Duration; + +use axum::extract::{Path, State}; +use axum::http::{HeaderMap, StatusCode}; +use axum::response::{IntoResponse, Response}; +use axum::routing::{get, post}; +use axum::{Json, Router}; +use chrono::{DateTime, Utc}; +use jsonwebtoken::{Algorithm, DecodingKey, Validation}; +use reqwest::Client; +use serde::{Deserialize, Serialize}; +use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::sync::RwLock; + +const GITHUB_ISSUER: &str = "https://token.actions.githubusercontent.com"; +const GITHUB_JWKS_URL: &str = "https://token.actions.githubusercontent.com/.well-known/jwks"; +const DEFAULT_AUDIENCE: &str = "dd-agent"; +const MAX_LOG_LINES: usize = 2_000; + +type Result = std::result::Result; + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cfg = Config::from_env()?; + let state = AppState::new(cfg); + let addr = format!("0.0.0.0:{}", state.cfg.port); + + let app = Router::new() + .route("/health", get(health)) + .route("/owner", post(set_owner)) + .route("/deploy", post(deploy)) + .route("/logs/{app}", get(logs)) + .route("/exec", post(exec)) + .with_state(state); + + eprintln!("dd-agent: listening on {addr}"); + let listener = tokio::net::TcpListener::bind(&addr).await?; + axum::serve(listener, app).await?; + Ok(()) +} + +#[derive(Clone)] +struct AppState { + cfg: Arc, + verifier: Arc, + started_at: DateTime, + owner: Arc>>, + workloads: Arc>>, +} + +impl AppState { + fn new(cfg: Config) -> Self { + let verifier = Arc::new(GithubOidc::new(cfg.oidc_audience.clone())); + Self { + cfg: Arc::new(cfg), + verifier, + started_at: Utc::now(), + owner: Arc::new(RwLock::new(None)), + workloads: Arc::new(RwLock::new(HashMap::new())), + } + } +} + +#[derive(Clone)] +struct Config { + 
agent_id: String, + hostname: String, + port: u16, + assignment_authority: Principal, + capabilities: Capabilities, + attestation: AttestationProof, + oidc_audience: String, +} + +impl Config { + fn from_env() -> anyhow::Result { + let agent_id = std::env::var("DD_AGENT_ID") + .unwrap_or_else(|_| format!("dd-agent-{}", uuid::Uuid::new_v4())); + let hostname = std::env::var("DD_HOSTNAME").unwrap_or_else(|_| "localhost".into()); + let port = std::env::var("DD_PORT") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(8080); + let assignment_authority = Principal::from_env("DD_ASSIGNMENT_AUTHORITY")?; + let capabilities = Capabilities { + runtime_deploy: env_truthy("DD_CAP_RUNTIME_DEPLOY", true), + exec: env_truthy("DD_CAP_EXEC", false), + interactive_shell: env_truthy("DD_CAP_INTERACTIVE_SHELL", false), + logs: env_truthy("DD_CAP_LOGS", true), + }; + let attestation = AttestationProof { + kind: std::env::var("DD_ATTESTATION_TYPE").unwrap_or_else(|_| "dev".into()), + quote_b64: std::env::var("DD_TDX_QUOTE_B64").ok(), + mrtd: std::env::var("DD_TDX_MRTD").ok(), + tcb_status: std::env::var("DD_TCB_STATUS").ok(), + }; + let oidc_audience = + std::env::var("DD_OIDC_AUDIENCE").unwrap_or_else(|_| DEFAULT_AUDIENCE.into()); + Ok(Self { + agent_id, + hostname, + port, + assignment_authority, + capabilities, + attestation, + oidc_audience, + }) + } +} + +fn env_truthy(key: &str, default: bool) -> bool { + match std::env::var(key) { + Ok(value) => matches!( + value.trim().to_ascii_lowercase().as_str(), + "1" | "true" | "yes" | "on" + ), + Err(_) => default, + } +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +#[serde(rename_all = "lowercase")] +enum PrincipalKind { + User, + Org, + Repo, +} + +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +struct Principal { + kind: PrincipalKind, + name: String, + id: u64, +} + +impl Principal { + fn from_env(prefix: &str) -> anyhow::Result { + let kind = match 
required_env(&format!("{prefix}_KIND"))?.as_str() { + "user" => PrincipalKind::User, + "org" => PrincipalKind::Org, + "repo" => PrincipalKind::Repo, + other => anyhow::bail!("{prefix}_KIND must be user|org|repo, got {other:?}"), + }; + let name = required_env(&format!("{prefix}_NAME"))?; + let id = required_env(&format!("{prefix}_ID"))?.parse::()?; + Self::validate(kind, name, id) + } + + fn validate(kind: PrincipalKind, name: String, id: u64) -> anyhow::Result { + if id == 0 { + anyhow::bail!("principal id must be non-zero"); + } + if name.trim().is_empty() { + anyhow::bail!("principal name must be non-empty"); + } + let has_slash = name.contains('/'); + let shape_ok = matches!( + (&kind, has_slash), + (PrincipalKind::Repo, true) + | (PrincipalKind::User, false) + | (PrincipalKind::Org, false) + ); + if !shape_ok { + anyhow::bail!("principal shape mismatch for {kind:?}: {name}"); + } + Ok(Self { kind, name, id }) + } + + fn matches(&self, claims: &GithubClaims) -> bool { + match self.kind { + PrincipalKind::User | PrincipalKind::Org => { + claims.repository_owner == self.name + && claims.repository_owner_id != 0 + && claims.repository_owner_id == self.id + } + PrincipalKind::Repo => { + claims.repository == self.name + && claims.repository_id != 0 + && claims.repository_id == self.id + } + } + } +} + +fn required_env(key: &str) -> anyhow::Result { + std::env::var(key) + .ok() + .filter(|value| !value.trim().is_empty()) + .ok_or_else(|| anyhow::anyhow!("{key} is required")) +} + +#[derive(Clone, Debug, Serialize)] +struct Capabilities { + runtime_deploy: bool, + exec: bool, + interactive_shell: bool, + logs: bool, +} + +#[derive(Clone, Debug, Serialize)] +struct AttestationProof { + #[serde(rename = "type")] + kind: String, + #[serde(skip_serializing_if = "Option::is_none")] + quote_b64: Option, + #[serde(skip_serializing_if = "Option::is_none")] + mrtd: Option, + #[serde(skip_serializing_if = "Option::is_none")] + tcb_status: Option, +} + +#[derive(Clone, Debug, 
Serialize)] +struct Assignment { + owner: Principal, + claim_id: String, + assigned_at: DateTime, + assigned_by: OidcActor, +} + +#[derive(Clone, Debug, Serialize)] +struct OidcActor { + sub: String, + repository: String, + repository_id: u64, + repository_owner: String, + repository_owner_id: u64, + workflow: String, + ref_: String, + sha: String, +} + +impl From<&GithubClaims> for OidcActor { + fn from(claims: &GithubClaims) -> Self { + Self { + sub: claims.sub.clone(), + repository: claims.repository.clone(), + repository_id: claims.repository_id, + repository_owner: claims.repository_owner.clone(), + repository_owner_id: claims.repository_owner_id, + workflow: claims.workflow.clone(), + ref_: claims.ref_.clone(), + sha: claims.sha.clone(), + } + } +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct SourceIdentity { + repo: String, + #[serde(default)] + ref_: String, + #[serde(default)] + commit: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +struct WorkloadSpec { + app_name: String, + cmd: Vec, + #[serde(default)] + source: Option, + #[serde(default)] + artifact_digest: Option, + #[serde(default)] + spec_digest: Option, + #[serde(default)] + env: HashMap, +} + +#[derive(Clone, Debug, Serialize)] +struct WorkloadRecord { + app_name: String, + source: Option, + artifact_digest: Option, + spec_digest: Option, + status: WorkloadStatus, + started_at: DateTime, + #[serde(skip_serializing_if = "Option::is_none")] + exited_at: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + exit_code: Option, + deployed_by: OidcActor, + #[serde(skip_serializing)] + logs: Vec, +} + +#[derive(Clone, Copy, Debug, Serialize)] +#[serde(rename_all = "snake_case")] +enum WorkloadStatus { + Running, + Exited, + Failed, +} + +#[derive(Debug, Deserialize)] +struct OwnerReq { + owner: Principal, + #[serde(default)] + claim_id: String, +} + +async fn health(State(state): State) -> Json { + let workloads: Vec = 
state.workloads.read().await.values().cloned().collect(); + let assignment = state.owner.read().await.clone(); + Json(serde_json::json!({ + "service": "dd-agent", + "ok": true, + "agent_id": state.cfg.agent_id, + "hostname": state.cfg.hostname, + "started_at": state.started_at, + "uptime_secs": (Utc::now() - state.started_at).num_seconds().max(0), + "assignment_authority": state.cfg.assignment_authority, + "owner": assignment.as_ref().map(|assignment| assignment.owner.clone()), + "assignment": assignment, + "attestation": state.cfg.attestation, + "capabilities": state.cfg.capabilities, + "workloads": workloads, + })) +} + +async fn set_owner( + State(state): State, + headers: HeaderMap, + Json(req): Json, +) -> Result> { + let claims = state + .verifier + .verify_principal(bearer(&headers)?, &state.cfg.assignment_authority) + .await?; + let claim_id = req.claim_id.trim().to_string(); + let mut guard = state.owner.write().await; + let previous = guard.clone(); + let changed = previous + .as_ref() + .map(|assignment| assignment.owner != req.owner || assignment.claim_id != claim_id) + .unwrap_or(true); + + if changed { + *guard = Some(Assignment { + owner: req.owner, + claim_id, + assigned_at: Utc::now(), + assigned_by: OidcActor::from(&claims), + }); + } + + Ok(Json(serde_json::json!({ + "agent_id": state.cfg.agent_id, + "changed": changed, + "previous": previous, + "assignment": guard.clone(), + }))) +} + +async fn deploy( + State(state): State, + headers: HeaderMap, + Json(spec): Json, +) -> Result> { + if !state.cfg.capabilities.runtime_deploy { + return Err(ApiError::Forbidden("runtime deployment is disabled".into())); + } + validate_workload(&spec)?; + let claims = require_current_owner(&state, &headers).await?; + let app_name = spec.app_name.clone(); + let record = WorkloadRecord { + app_name: app_name.clone(), + source: spec.source.clone(), + artifact_digest: spec.artifact_digest.clone(), + spec_digest: spec.spec_digest.clone(), + status: 
WorkloadStatus::Running, + started_at: Utc::now(), + exited_at: None, + exit_code: None, + deployed_by: OidcActor::from(&claims), + logs: Vec::new(), + }; + state + .workloads + .write() + .await + .insert(app_name.clone(), record); + spawn_workload(state.workloads.clone(), app_name.clone(), spec).await?; + Ok(Json(serde_json::json!({ + "agent_id": state.cfg.agent_id, + "app_name": app_name, + "status": "running", + }))) +} + +fn validate_workload(spec: &WorkloadSpec) -> Result<()> { + if spec.app_name.trim().is_empty() { + return Err(ApiError::BadRequest("app_name is required".into())); + } + if spec.cmd.is_empty() || spec.cmd[0].trim().is_empty() { + return Err(ApiError::BadRequest("cmd must be a non-empty array".into())); + } + Ok(()) +} + +async fn spawn_workload( + workloads: Arc>>, + app_name: String, + spec: WorkloadSpec, +) -> Result<()> { + let mut cmd = tokio::process::Command::new(&spec.cmd[0]); + cmd.args(spec.cmd.iter().skip(1)) + .envs(spec.env) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + let mut child = cmd + .spawn() + .map_err(|error| ApiError::BadRequest(format!("spawn {}: {error}", spec.cmd[0])))?; + + if let Some(stdout) = child.stdout.take() { + let workloads = workloads.clone(); + let app_name = app_name.clone(); + tokio::spawn(async move { + stream_logs(workloads, app_name, "stdout", stdout).await; + }); + } + if let Some(stderr) = child.stderr.take() { + let workloads = workloads.clone(); + let app_name = app_name.clone(); + tokio::spawn(async move { + stream_logs(workloads, app_name, "stderr", stderr).await; + }); + } + + tokio::spawn(async move { + let status = child.wait().await; + let mut guard = workloads.write().await; + if let Some(record) = guard.get_mut(&app_name) { + record.exited_at = Some(Utc::now()); + match status { + Ok(status) => { + record.exit_code = status.code(); + record.status = if status.success() { + WorkloadStatus::Exited + } else { + WorkloadStatus::Failed + }; + push_log( + &mut record.logs, + 
format!("process exited with status {status}"), + ); + } + Err(error) => { + record.status = WorkloadStatus::Failed; + push_log(&mut record.logs, format!("wait failed: {error}")); + } + } + } + }); + Ok(()) +} + +async fn stream_logs( + workloads: Arc>>, + app_name: String, + stream_name: &'static str, + stream: T, +) where + T: tokio::io::AsyncRead + Unpin, +{ + let mut lines = BufReader::new(stream).lines(); + while let Ok(Some(line)) = lines.next_line().await { + let mut guard = workloads.write().await; + if let Some(record) = guard.get_mut(&app_name) { + push_log(&mut record.logs, format!("[{stream_name}] {line}")); + } + } +} + +fn push_log(logs: &mut Vec, line: String) { + logs.push(line); + if logs.len() > MAX_LOG_LINES { + let excess = logs.len() - MAX_LOG_LINES; + logs.drain(0..excess); + } +} + +async fn logs( + State(state): State, + Path(app): Path, + headers: HeaderMap, +) -> Result> { + if !state.cfg.capabilities.logs { + return Err(ApiError::Forbidden("logs are disabled".into())); + } + let _ = require_current_owner(&state, &headers).await?; + let guard = state.workloads.read().await; + let record = guard.get(&app).ok_or(ApiError::NotFound)?; + Ok(Json(serde_json::json!({ + "app_name": app, + "status": record.status, + "lines": record.logs, + }))) +} + +#[derive(Debug, Deserialize)] +struct ExecReq { + cmd: Vec, + #[serde(default = "default_exec_timeout_secs")] + timeout_secs: u64, +} + +fn default_exec_timeout_secs() -> u64 { + 30 +} + +async fn exec( + State(state): State, + headers: HeaderMap, + Json(req): Json, +) -> Result> { + if !state.cfg.capabilities.exec { + return Err(ApiError::Forbidden("exec is disabled".into())); + } + if req.cmd.is_empty() || req.cmd[0].trim().is_empty() { + return Err(ApiError::BadRequest("cmd must be a non-empty array".into())); + } + let _ = require_current_owner(&state, &headers).await?; + let output = tokio::time::timeout( + Duration::from_secs(req.timeout_secs), + tokio::process::Command::new(&req.cmd[0]) + 
.args(req.cmd.iter().skip(1)) + .output(), + ) + .await + .map_err(|_| ApiError::BadRequest("exec timed out".into()))? + .map_err(|error| ApiError::BadRequest(format!("exec failed: {error}")))?; + + Ok(Json(serde_json::json!({ + "status": output.status.code(), + "success": output.status.success(), + "stdout": String::from_utf8_lossy(&output.stdout), + "stderr": String::from_utf8_lossy(&output.stderr), + }))) +} + +async fn require_current_owner(state: &AppState, headers: &HeaderMap) -> Result { + let owner = state + .owner + .read() + .await + .as_ref() + .map(|assignment| assignment.owner.clone()) + .ok_or_else(|| ApiError::Conflict("agent has no current owner".into()))?; + state + .verifier + .verify_principal(bearer(headers)?, &owner) + .await +} + +fn bearer(headers: &HeaderMap) -> Result<&str> { + let auth = headers + .get(axum::http::header::AUTHORIZATION) + .and_then(|value| value.to_str().ok()) + .ok_or(ApiError::Unauthorized)?; + auth.strip_prefix("Bearer ") + .or_else(|| auth.strip_prefix("bearer ")) + .map(str::trim) + .filter(|token| !token.is_empty()) + .ok_or(ApiError::Unauthorized) +} + +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +struct GithubClaims { + exp: i64, + iat: i64, + iss: String, + #[serde(default)] + sub: String, + #[serde(default)] + repository: String, + #[serde(default)] + repository_id: u64, + #[serde(default)] + repository_owner: String, + #[serde(default)] + repository_owner_id: u64, + #[serde(default, rename = "ref")] + ref_: String, + #[serde(default)] + workflow: String, + #[serde(default)] + sha: String, +} + +struct GithubOidc { + audience: String, + http: Client, + keys: RwLock>, +} + +impl GithubOidc { + fn new(audience: String) -> Self { + Self { + audience, + http: Client::new(), + keys: RwLock::new(HashMap::new()), + } + } + + async fn verify_principal(&self, token: &str, principal: &Principal) -> Result { + let claims = self.decode_and_validate(token).await?; + if principal.matches(&claims) { + Ok(claims) + 
} else { + Err(ApiError::Unauthorized) + } + } + + async fn decode_and_validate(&self, token: &str) -> Result { + let header = jsonwebtoken::decode_header(token) + .map_err(|error| ApiError::BadRequest(format!("gh oidc header: {error}")))?; + if !allowed_alg(header.alg) { + return Err(ApiError::BadRequest(format!( + "gh oidc alg {:?} not allowed", + header.alg + ))); + } + let kid = header + .kid + .ok_or_else(|| ApiError::BadRequest("gh oidc token missing kid".into()))?; + let key = + match self.keys.read().await.get(&kid).cloned() { + Some(key) => key, + None => { + self.refresh().await?; + self.keys.read().await.get(&kid).cloned().ok_or_else(|| { + ApiError::BadRequest(format!("gh oidc kid {kid} not found")) + })? + } + }; + let mut validation = Validation::new(header.alg); + validation.set_issuer(&[GITHUB_ISSUER]); + validation.set_audience(&[self.audience.as_str()]); + validation.leeway = 60; + validation.set_required_spec_claims(&["exp", "iat", "iss", "aud"]); + let data = jsonwebtoken::decode::(token, &key, &validation) + .map_err(|error| ApiError::BadRequest(format!("gh oidc verify: {error}")))?; + Ok(data.claims) + } + + async fn refresh(&self) -> Result<()> { + let resp = self + .http + .get(GITHUB_JWKS_URL) + .send() + .await + .map_err(|error| ApiError::Upstream(format!("GH JWKS fetch: {error}")))?; + if !resp.status().is_success() { + return Err(ApiError::Upstream(format!( + "GH JWKS fetch returned HTTP {}", + resp.status() + ))); + } + let jwks: jsonwebtoken::jwk::JwkSet = resp + .json() + .await + .map_err(|error| ApiError::Upstream(format!("GH JWKS parse: {error}")))?; + let mut keys = HashMap::new(); + for jwk in &jwks.keys { + let Some(kid) = &jwk.common.key_id else { + continue; + }; + if let Ok(key) = DecodingKey::from_jwk(jwk) { + keys.insert(kid.clone(), key); + } + } + *self.keys.write().await = keys; + Ok(()) + } +} + +fn allowed_alg(alg: Algorithm) -> bool { + matches!( + alg, + Algorithm::RS256 + | Algorithm::RS384 + | Algorithm::RS512 + | 
Algorithm::PS256 + | Algorithm::PS384 + | Algorithm::PS512 + ) +} + +#[derive(Debug, thiserror::Error)] +enum ApiError { + #[error("{0}")] + BadRequest(String), + #[error("unauthorized")] + Unauthorized, + #[error("{0}")] + Forbidden(String), + #[error("{0}")] + Conflict(String), + #[error("not found")] + NotFound, + #[error("{0}")] + Upstream(String), +} + +impl IntoResponse for ApiError { + fn into_response(self) -> Response { + let status = match self { + ApiError::BadRequest(_) => StatusCode::BAD_REQUEST, + ApiError::Unauthorized => StatusCode::UNAUTHORIZED, + ApiError::Forbidden(_) => StatusCode::FORBIDDEN, + ApiError::Conflict(_) => StatusCode::CONFLICT, + ApiError::NotFound => StatusCode::NOT_FOUND, + ApiError::Upstream(_) => StatusCode::BAD_GATEWAY, + }; + let body = Json(serde_json::json!({ + "error": self.to_string(), + })); + (status, body).into_response() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn claims_for_repo() -> GithubClaims { + GithubClaims { + repository: "example/oracle".into(), + repository_id: 42, + repository_owner: "example".into(), + repository_owner_id: 7, + ..GithubClaims::default() + } + } + + #[test] + fn principal_shape_validation() { + assert!(Principal::validate(PrincipalKind::Repo, "example/oracle".into(), 42).is_ok()); + assert!(Principal::validate(PrincipalKind::Repo, "example".into(), 42).is_err()); + assert!(Principal::validate(PrincipalKind::Org, "example/oracle".into(), 7).is_err()); + assert!(Principal::validate(PrincipalKind::User, "alice".into(), 1).is_ok()); + assert!(Principal::validate(PrincipalKind::User, "alice".into(), 0).is_err()); + } + + #[test] + fn repo_principal_matches_repo_claims() { + let principal = + Principal::validate(PrincipalKind::Repo, "example/oracle".into(), 42).unwrap(); + assert!(principal.matches(&claims_for_repo())); + } + + #[test] + fn org_principal_matches_owner_claims() { + let principal = Principal::validate(PrincipalKind::Org, "example".into(), 7).unwrap(); + 
assert!(principal.matches(&claims_for_repo())); + } + + #[test] + fn principal_matching_requires_numeric_id() { + let principal = + Principal::validate(PrincipalKind::Repo, "example/oracle".into(), 999).unwrap(); + assert!(!principal.matches(&claims_for_repo())); + } + + #[test] + fn workload_requires_app_and_command() { + let ok = WorkloadSpec { + app_name: "oracle".into(), + cmd: vec!["/bin/echo".into(), "ok".into()], + source: None, + artifact_digest: None, + spec_digest: None, + env: HashMap::new(), + }; + assert!(validate_workload(&ok).is_ok()); + + let missing_cmd = WorkloadSpec { + cmd: Vec::new(), + ..ok.clone() + }; + assert!(validate_workload(&missing_cmd).is_err()); + + let missing_app = WorkloadSpec { + app_name: String::new(), + ..ok + }; + assert!(validate_workload(&missing_app).is_err()); + } +} diff --git a/docs/rewrite-plan.md b/docs/rewrite-plan.md new file mode 100644 index 0000000..ef9a6bd --- /dev/null +++ b/docs/rewrite-plan.md @@ -0,0 +1,217 @@ +# DD v2 Rewrite Plan + +The current repository grew organically around DD's own fleet and CI. The v2 +rewrite should start from the product primitives instead: agents, assignment, +workloads, and proofs. + +## Target Shape + +### Crates or Repos + +1. `dd-agent` + - small Rust binary + - owns attestation, assignment, deploy, logs, proof + - no Cloudflare-specific fleet assumptions in core logic + +2. `dd-cp` + - optional control plane + - discovers agents, reconciles assignments, hosts dashboard + - useful for managed fleets, not required for self-hosted agents + +3. `dd-action` + - GitHub Actions interface + - `assign`, `deploy`, `verify` + - stable public API for workload repositories + +4. `dd-spec` + - workload schema + - proof schema + - assignment schema + - verifier rules and examples + +5. `examples` + - confidential coding LLM + - OpenClaw/Hermes-style bot + - crypto oracle + - self-hosted single-agent setup + +## Agent Rewrite + +Keep the agent intentionally boring. 
+ +### State + +```rust +struct AgentState { + agent_id: AgentId, + assignment_authority: Principal, + current_owner: Option<Principal>, + workloads: WorkloadStore, + attestation: AttestationBundle, + capabilities: CapabilitySet, +} +``` + +`current_owner` is runtime state. It can reset on reboot. The external assigner +is responsible for reconciling desired state. + +### Endpoints + +- `GET /health` +- `POST /owner` +- `POST /deploy` +- `GET /logs/{app}` +- `POST /exec` if capability-enabled + +Everything else belongs in the optional CP or examples. + +### Authorization + +- `/owner`: assignment authority only +- `/deploy`: current owner only +- `/logs`: current owner only unless workload marks logs public +- `/exec`: current owner only and capability-enabled +- `/health`: public + +No endpoint should mean "fleet owner OR tenant owner" for workload control. +There is one current deploy owner. + +## Control Plane Rewrite + +The CP should not be the trust root for open-source workload verification. +It is coordination. + +Responsibilities: + +- maintain desired assignment records +- call `/owner` until actual state matches desired state +- index proof documents +- provide dashboards +- manage ingress if configured +- expose assignment audit trails + +The CP should tolerate agents that are: + +- self-hosted +- managed by another operator +- temporarily unassigned after reboot +- assigned to repo principals outside the DD org + +## GitHub Action Rewrite + +The action is the product interface for external repos.
+ +Commands: + +```yaml +- uses: devopsdefender/dd-action/verify@v2 +- uses: devopsdefender/dd-action/deploy@v2 +- uses: devopsdefender/dd-action/assign@v2 +``` + +### `verify` + +- fetch proof document +- verify TDX quote +- verify owner principal +- verify capabilities against selected profile +- emit normalized JSON for downstream steps + +### `deploy` + +- mint GitHub OIDC token +- verify proof owner matches caller +- submit workload spec +- poll proof until workload appears +- fail if `deployed_by` does not match caller claims + +### `assign` + +- resolve GitHub principal IDs +- call `/owner` +- poll proof until owner matches +- safe to repeat forever + +## CI/CD Rewrite + +Split CI into product concerns and internal fleet concerns. + +### Product CI + +Runs on every PR: + +- format +- lint +- unit tests +- schema validation +- build static artifacts +- generate provenance + +### Release CI + +Runs on tags or main release branches: + +- publish `dd-agent` +- publish `dd-cp` +- publish `dd-action` +- publish schemas +- attach provenance + +### Internal Fleet CI + +Runs only for DD-operated infra: + +- deploy DD's own CP +- relaunch DD's own demo agents +- clean up preview infrastructure + +This should live under a clearly named internal workflow boundary. It should +not define how user workloads are expected to deploy. + +## Migration Steps + +1. Keep the v1 deletion landed so old fleet assumptions do not keep shaping new code. +2. Extract principal parsing and GitHub OIDC verification into a small reusable module. +3. Harden the proof schema and verifier action. +4. Add TDX quote capture and verification to `dd-agent`. +5. Port EasyEnclave workload execution behind the current-owner-only API. +6. Build a small reconciler that repeatedly assigns owners. +7. Port only necessary Cloudflare/ingress code into `dd-cp`. +8. Move examples into standalone workload repositories. +9. Add release provenance for the agent and action surfaces. 
+ +## Design Rules + +- Do not optimize for DD's own tdx2 host in core code. +- Do not make workload repos depend on the DD repo's CI. +- Do not persist owner locally as source of truth. +- Do not give operators deploy authority unless they are current owner. +- Do not hide mutability behind marketing terms like "confidential." +- Do expose enough proof for external verifiers to reject unsafe agents. + +## First Implementation Milestone + +A useful v2 alpha is: + +- single self-hosted agent +- runtime assignment +- GitHub OIDC deploy from an external repo +- public proof document +- verifier action + +No dashboard, no fleet cleanup, no GPU demo, no preview environments. Those can +come after the primitive works. + +## Current Alpha Slice + +This repo now carries the v2 alpha as the only Rust build target: + +- `crates/dd-agent` implements the minimal agent primitive. +- `.github/actions/assign` reconciles runtime owner assignment. +- `.github/actions/deploy` deploys from an external workload repo. +- `.github/actions/verify` checks the public proof document. + +The v1 control plane, fleet workflows, EasyEnclave app examples, and +tdx2-specific scripts have been removed from this branch. The remaining work is +to port only the parts that fit the v2 model: TDX proof, EasyEnclave execution, +Cloudflare ingress, and managed-fleet reconciliation. diff --git a/docs/spec-v2.md b/docs/spec-v2.md new file mode 100644 index 0000000..acd01ea --- /dev/null +++ b/docs/spec-v2.md @@ -0,0 +1,221 @@ +# DD v2 Product Spec + +DD v2 is an attested execution layer for open-source agent workloads. +The core product is not "CI/CD for this repo"; it is a way to lease or +self-run confidential agents whose current deploy authority is a GitHub +principal and whose runtime state can be verified by third parties. + +## Core Use Cases + +1. **Confidential LLM coding agents** + - A user assigns an agent to their GitHub user, organization, or repo. 
+ - Their repo deploys an open-source coding agent workload by GitHub OIDC. + - The user verifies TDX attestation, source provenance, and live workload state. + +2. **Autonomous open-source bots** + - Projects like OpenClaw or Hermes live in their own repositories. + - Their own CI deploys to assigned agents; the DD repo is only substrate. + - Reassignment after reboot is expected and idempotent. + +3. **Confidential crypto oracles** + - Oracle code is public and built from a public repo/ref. + - Consumers verify the agent, workload source, build provenance, and attestation. + - Operators may provide infrastructure, but cannot silently become workload owner. + +## Product Primitives + +### Agent + +An agent is a TDX-backed runtime with: + +- a stable `agent_id` +- a public `hostname` +- a current owner principal +- an assignment authority +- a current workload set +- a public proof document + +An agent does not need durable local ownership state. On reboot it may start +unassigned, then an external assigner reconciles it back to the desired owner. + +### Principal + +A principal is one of: + +- `user:<login>#<id>` +- `org:<login>#<id>` +- `repo:<owner>/<name>#<id>` + +The numeric GitHub ID is required. Name-only matching is not sufficient because +deleted or transferred GitHub names can be re-registered. + +### Assignment + +Assignment sets the agent's current owner principal. + +Properties: + +- runtime state, not durable VM truth +- idempotent for the same owner and claim +- safe to repeat after every reboot +- authorized by the assignment authority, not by the current owner +- auditable by `claim_id` + +The assignment authority may be: + +- the self-hosting user +- a managed fleet operator +- a billing/lease controller +- a repo-specific automation account + +This is still one product mode. The operator controls infrastructure lifecycle; +the current owner controls workload deployment. + +### Workload + +A workload is an open-source repo artifact plus runtime spec.
+ +Minimum identity: + +- source repository +- git ref or immutable commit +- build workflow identity +- artifact digest +- workload spec digest + +The DD repo should not contain production workloads except examples. Real bots, +LLM agents, and oracles live in their own repositories and deploy themselves. + +### Proof + +Every agent exposes a machine-readable proof document. It should be sufficient +for a verifier to answer: + +- which hardware-backed enclave am I talking to? +- who currently owns deploy authority? +- what workload source is claimed? +- what artifact digest is running? +- was the workload deployed by the owner principal? +- are mutation/debug capabilities enabled? + +Draft shape: + +```json +{ + "service": "dd-agent", + "agent_id": "dd-agent-...", + "hostname": "agent.example.com", + "status": "healthy", + "owner": { + "kind": "repo", + "name": "example/oracle", + "id": 123456789 + }, + "assignment": { + "claim_id": "lease_abc123", + "assigned_at": "2026-04-28T00:00:00Z" + }, + "attestation": { + "type": "tdx", + "quote_b64": "...", + "mrtd": "...", + "tcb_status": "UpToDate" + }, + "workloads": [ + { + "app_name": "oracle", + "source_repo": "example/oracle", + "source_ref": "refs/tags/v1.2.3", + "source_commit": "...", + "artifact_digest": "sha256:...", + "spec_digest": "sha256:...", + "deployed_by": { + "repository": "example/oracle", + "repository_id": 123456789, + "workflow": "deploy.yml" + } + } + ], + "capabilities": { + "runtime_deploy": true, + "exec": false, + "interactive_shell": false, + "logs": true + } +} +``` + +## API Surface + +### Agent API + +The agent should remain small. 
+ +| Endpoint | Purpose | Auth | +| --- | --- | --- | +| `GET /health` | public proof and liveness | none | +| `POST /owner` | assign current owner | assignment authority OIDC | +| `POST /deploy` | deploy workload spec | current owner OIDC | +| `GET /logs/{app}` | read workload logs | current owner OIDC | +| `POST /exec` | optional debug command | current owner OIDC + capability enabled | + +`POST /owner` is explicitly safe to call repeatedly. Same requested owner and +same `claim_id` should return success without changing runtime state. + +### Control Plane API + +The control plane is optional coordination, not core trust. + +Responsibilities: + +- discover agents +- reconcile desired assignments +- provide dashboards +- expose lease/claim state +- route traffic +- collect proof documents + +An agent should still be understandable as a standalone product primitive. + +## GitHub Actions Model + +Each workload repository owns its deploy workflow: + +```yaml +permissions: + id-token: write + contents: read + +steps: + - uses: actions/checkout@v4 + - uses: devopsdefender/dd-action/deploy@v2 + with: + agent: https://agent.example.com + workload: workload.json +``` + +The action should: + +1. mint a GitHub Actions OIDC token +2. resolve or validate the target agent +3. verify the agent's proof document +4. submit the workload spec +5. wait for the proof document to reflect the deployment + +## Invariants + +- Every deployable agent has one current owner. +- Current owner is the only deploy authority. +- Assignment authority is separate from deploy authority. +- Assignment is idempotent and externally reconciled. +- Workload code is open source. +- Verification does not require trusting DD marketing, dashboards, or CI logs. +- DD infrastructure examples are not the product boundary. + +## Non-Goals + +- DD v2 is not a general Kubernetes replacement. +- DD v2 is not a secret-bearing CI system. +- DD v2 does not require workloads to live in the DD repo. 
+- DD v2 does not make VM-local ownership persistence authoritative. +- DD v2 does not require a central hosted control plane for self-hosted agents. diff --git a/docs/threat-model-v2.md b/docs/threat-model-v2.md new file mode 100644 index 0000000..83d42b1 --- /dev/null +++ b/docs/threat-model-v2.md @@ -0,0 +1,181 @@ +# DD v2 Threat Model + +DD v2 separates three roles that the current codebase tends to blend together: + +- **operator**: runs hardware, networking, billing, and assignment automation +- **owner**: current GitHub principal allowed to deploy workloads +- **verifier**: user or protocol participant checking what is running + +The design goal is not to eliminate operator power over infrastructure. It is +to make workload authority and runtime proof explicit enough that a verifier can +detect the relevant trust boundary. + +## Assets + +- current owner principal +- assignment claim history +- workload source identity +- workload artifact digest +- TDX quote and measurements +- agent signing/noise key material +- runtime logs and output +- ingress routes for workload endpoints + +## Trust Boundaries + +### Operator Boundary + +The operator can: + +- start and stop machines +- replace a VM +- route or de-route hostnames +- assign an agent if authorized by the assignment system +- observe public proof documents and exposed traffic metadata + +The operator should not be able to: + +- deploy a workload unless they are also current owner +- silently impersonate a repo owner through GitHub OIDC +- make a verifier accept a workload without matching proof +- mutate owner state without an assignment event + +### Owner Boundary + +The owner can: + +- deploy workloads from matching GitHub Actions OIDC claims +- read logs if the agent exposes logs +- use debug routes only if the agent capability set allows it + +The owner should not be able to: + +- change assignment authority +- claim ownership of other agents +- forge attestation or artifact provenance + +### 
Verifier Boundary + +The verifier can: + +- fetch `GET /health` +- verify TDX attestation +- compare owner principal to expected GitHub identity +- compare workload source, ref, and digest to expected open-source code +- reject agents with unsafe capabilities + +The verifier should not need: + +- DD repo write access +- Cloudflare dashboard access +- SSH to the host +- trust in CI log screenshots + +## Primary Risks + +### Name Squatting + +Risk: GitHub login or repo name changes hands. + +Mitigation: + +- principal matching requires numeric GitHub IDs +- proof documents expose IDs, not just names +- assignment records store IDs + +### Reboot Loses Owner + +Risk: runtime owner disappears after VM reboot. + +Mitigation: + +- this is expected behavior +- assignment is desired state outside the VM +- reconcilers repeatedly call `POST /owner` +- deploy workflows wait for `health.owner == expected` + +### Operator Deploys Customer Workload + +Risk: managed operator deploys or changes workload after assignment. + +Mitigation: + +- deploy endpoint accepts current owner only +- operator assignment authority is not deploy authority +- proof exposes `deployed_by` GitHub claims + +### Runtime Debug Invalidates Confidentiality + +Risk: `/exec`, terminal, or mutable deployment channels undermine oracle claims. + +Mitigation: + +- proof exposes capability flags +- oracle profiles should disable `exec` and interactive shell +- verifier policy rejects unexpected capabilities + +### Closed-Source or Ambiguous Artifact + +Risk: verifier cannot map runtime artifact to public source. + +Mitigation: + +- workload proof includes repo/ref/commit/artifact digest +- deploy action should attach build provenance +- examples and docs require OSS repos for production workloads + +### Stale or Forked Agent + +Risk: old code keeps serving a convincing dashboard. 
+ +Mitigation: + +- verifier checks TDX quote and measurement policy +- proof schema is machine-readable +- deployment action verifies proof before and after deploy + +## Capability Profiles + +### Coding Agent + +Expected capabilities: + +- `runtime_deploy: true` +- `logs: true` +- `exec: optional` +- `interactive_shell: optional` + +Verifier posture: user trusts their own assigned agent but still wants +attestation and source provenance. + +### Bot Agent + +Expected capabilities: + +- `runtime_deploy: true` +- `logs: true` +- `exec: false` by default +- `interactive_shell: false` by default + +Verifier posture: project maintainers check assignment and deployment source. + +### Oracle Agent + +Expected capabilities: + +- `runtime_deploy: false` after boot or controlled redeploy only +- `logs: limited` +- `exec: false` +- `interactive_shell: false` + +Verifier posture: third parties reject mutable/debuggable agents unless the +oracle protocol explicitly permits those capabilities. + +## Security Invariants + +- Assignment authority and workload authority are separate. +- Current owner is the only deploy authority. +- GitHub principal IDs are mandatory. +- Reassignment is idempotent and auditable. +- Proof is public and machine-readable. +- Confidentiality claims are tied to capabilities, not branding. diff --git a/index.html b/index.html deleted file mode 100644 index c45f063..0000000 --- a/index.html +++ /dev/null @@ -1,261 +0,0 @@ - - - - - - DevOps Defender - Confidential MCP Servers and Oracles - - - - - - - -
-

Confidential apps, attestable by design

-

Enclave computing is hard. EasyEnclave makes it easy. It's a drop-in replacement for a Linux distribution inside Intel TDX VMs — no Go, no systemd, no package manager. From this pristine confidential base, install an open-source MCP server or crypto oracle whose code can be attested, or install a Noise + LLM confidential endpoint for a private bot with durable history. Verify either path via the TDX quote on /health.

- -
-
EE Pristine — Confidential base image
-
MCP Ready — Attestable tool servers
-
Noise + LLM — Private bot endpoints
-
-
- -
-
-

How It Works

-

Three steps from code to a private, remotely verifiable app.

-
-
-
1
-
-

Start pristine

-

Boot a TDX VM where EasyEnclave replaces the Linux distribution: a single Rust PID 1, no systemd, no Go runtime, no apt or yum. Sealed memory, no tenant deploy authority, an empty taint profile — the smallest measurable surface that still runs your apps.

-
-
-
-
2
-
-

Install one shape

-

Choose an attestable open-source workload, such as an MCP server or crypto oracle, or choose a sealed Noise + LLM endpoint for an always-on private bot. Both are ordinary workload.json apps; the difference is who has runtime authority after boot.

-
-
-
-
3
-
-

Attest & use

-

Intel TDX seals guest memory; the agent mints a quote and the CP verifies it via Intel Trust Authority at /register. Clients can check the quote, taint reasons, and workload identity before sending secrets, prompts, keys, or trading logic.

-
-
-
-
-
- -
-
-

Use Cases

-

Confidential apps where the code, history, and operator boundary matter.

-
-
-
🧰
-

Attestable MCP Servers

-

Run a Model Context Protocol server from open-source code in a measured VM. A client can verify the server binary, boot config, and taint profile before granting tools, files, or credentials.

-
-
-
-

Crypto Oracles

-

Publish oracle logic as a public repo and run it sealed. Users verify that the exact oracle code is live before relying on signed market data, settlement decisions, or automated treasury actions.

-
-
-
🔒
-

Confidential Bots

-

Install a Noise + LLM endpoint that keeps conversation history inside the enclave. The bot can remember useful context while the operator still cannot read the history or swap the runtime after boot.

-
-
-
📊
-

Fleet Metrics

-

CP dashboard scrapes /health on every agent: CPU, memory, per-disk capacity, per-NIC rx/tx, ITA attestation status, workload identity, and taint reasons.

-
-
-
🚀
-

Workloads as JSON

-

Every app — cloudflared, dd-agent, MCP servers, oracles, LLM endpoints, yours — is one file at apps/<name>/workload.json. Boot workloads and runtime deploys share the same schema.

-
-
-
🛡
-

EasyEnclave Runtime

-

EasyEnclave replaces your Linux distribution inside the sealed VM — an open-source Rust PID 1 with a Unix socket API. No systemd, no Go, no networking in the runtime. Enclave computing was hard; this is the boring base layer that makes it easy.

-
-
-
-

Sats for Compute

-

The canonical example operator: pay BTC, get a fresh attested node bound to your GitHub identity, or a sealed oracle from your public workload repo. State lives in GitHub issues; workflow_dispatch is the actuator. satsforcompute.com.

-
-
-
🌐
-

Cloudflare Tunnels

-

Every agent gets a tunnel hostname. No public IPs, no firewall rules, no port forwarding. The CP provisions tunnels automatically on registration.

-
-
-
📜
-

Signed Releases

-

Every devopsdefender binary CI publishes carries a Sigstore-backed GitHub build attestation. gh attestation verify proves a binary came from this repo's release workflow — provable provenance, not trust us.

-
-
-
-
- -
-
-

Two install paths

-

Same pristine EasyEnclave substrate, two product shapes — chosen at boot, provable from the TDX quote.

-
-
- attestable open source -

MCP servers and oracles

-

Install a workload from open-source code whose repo, release, and boot config are part of the evidence. Clients verify the TDX quote and /health before they trust the MCP tools, oracle outputs, private keys, API credentials, or trading logic.

-

For: MCP tool servers, crypto price oracles, settlement bots, verifiable automation.

-
-
- confidential endpoint -

Noise + LLM private bot

-

Install a sealed endpoint that speaks Noise to the client and keeps prompts, memory, and chat history inside the enclave. The agent boots with DD_CONFIDENTIAL=true — /deploy, /exec, and /owner are not registered on this node. Logs and attestation stay open; operator mutation paths stay closed.

-

For: private assistants, trading bots, support agents, and long-lived agents that need history without exposing it to the host.

-
-
-
-
- -
-
-

Trust model

-

EasyEnclave is the pristine confidential root. Taint is a set of reasons, not a boolean. Read /health.taint_reasons + the TDX quote and reconstruct the trust profile in one fetch.

- - - - - - - - - - - - - - - - - -
pristine — Booted from a known EasyEnclave image. No customer has had deploy / exec / shell authority. The taint_reasons set is empty.
tainted — Customer-influenced via at least one channel. Reasons surface which:
-
    -
  • customer_workload_deployed — a /deploy succeeded since boot
  • -
  • customer_owner_enabled/owner set a non-fleet tenant
  • -
  • arbitrary_exec_enabled — node booted with /deploy + /exec registered (i.e. not confidential mode)
  • -
  • interactive_shell_enabled — ttyd or equivalent in the running workload
  • -
-
safe_mode — On reboot, the agent's runtime owner clears and the node returns to bot/DD control. It may still be tainted; safe_mode is not the same as pristine. A tainted node is rebuilt before reassignment.
-

Confidential-mode nodes ship with the workload measured at boot and no runtime mutation endpoints. The absence of arbitrary_exec_enabled + interactive_shell_enabled is the cryptographic signal that the operator cannot tamper with the running MCP server, oracle, or private bot.

-
-
- -
-
-

Architecture

-

1 binary, 2 install paths, workloads as code.

-
-
-Customer (browser)
-  |
-  v
-Cloudflare Edge ──── tunnel ────> TDX VM
-                                    |
-                            easyenclave (PID 1)
-                            |
-                            └── spawns confidential apps from apps/*/workload.json
-                                ├── cloudflared           (fetch-only, gives us a tunnel)
-                                ├── devopsdefender agent  (/health, /deploy, /exec)
-                                ├── podman                (static, rootful, daemon-less)
-                                ├── mcp server             (attestable open-source tools)
-                                ├── crypto oracle          (sealed decision logic)
-                                └── Noise + LLM endpoint   (private bot + history)
-
-devopsdefender cp (fleet dashboard + management)
-  ├── discovers agents via CF tunnels
-  ├── scrapes /health (per-disk + per-NIC metrics)
-  ├── verifies each agent's ITA quote at /register
-  └── web UI: fleet table, per-agent detail, in-browser shell
-
-
-
-
- -
-
-

Deploy with GitHub Actions

-

Per-job OIDC token, no stored credentials. Any workflow in the DD GitHub org deploys — nothing else does.

-
-
.github/workflows/deploy.yml
-
jobs:
-  deploy:
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: write   # mints the OIDC token
-      contents: read
-    steps:
-      - uses: actions/checkout@v4
-      - uses: devopsdefender/dd/.github/actions/dd-deploy@main
-        with:
-          cp-url: https://app.devopsdefender.com
-          vm-name: dd-local-prod
-          workload: apps/myapp/workload.json
-
-

The agent verifies the GitHub Actions OIDC JWT in-code against GitHub's JWKS, checks repository_owner == DD_OWNER, and launches the workload. No service tokens, no PATs, no secrets stored anywhere.

-
-
- -
-
-
-

Powered by EasyEnclave

-

A Linux distribution replacement for confidential VMs. Enclave computing is hard; EasyEnclave makes it easy. One Rust binary as PID 1 inside Intel TDX — no systemd, no Go, no package manager, no shell to attack. Unix socket API. No networking in the runtime. DevOps Defender layers attestable app installation, fleet health, and confidential endpoint patterns on top.

- Learn More -
-
-
- -
-

Run private tools people can verify.

-

Attestable MCP servers, sealed crypto oracles, and confidential bots with memory.

- -
- - - - - diff --git a/src/agent.rs b/src/agent.rs deleted file mode 100644 index 4c4367e..0000000 --- a/src/agent.rs +++ /dev/null @@ -1,969 +0,0 @@ -//! Agent mode — runs inside an easyenclave TDX VM. -//! -//! On startup: POST `{vm_name, env_label, owner, ita_token}` to -//! `$DD_CP_URL/register` (no auth — ITA attestation is the gate; -//! the path is exempt from CF Access via a bypass app). The CP -//! responds with `{tunnel_token, hostname, agent_id, cp_hostname}`. -//! -//! Auth after registration: -//! - Browser routes (`/`, `/workload/*`) are behind CF Access with -//! the same human policy as the CP dashboard. -//! - Terminal is a separate `ttyd` workload published on -//! `block.` — a plain web shell, not tied to any -//! deployment. -//! - `/deploy` and `/exec` are CF-Access-bypassed and gated in-code -//! by a GitHub Actions OIDC token — any CI workflow whose -//! principal matches `DD_OWNER`/`DD_OWNER_ID`/`DD_OWNER_KIND` -//! (see [`gh_oidc::Principal::matches`]) can call them by -//! presenting its per-job OIDC JWT as `Authorization: Bearer …`. -//! - Agent → CP `/ingress/replace` calls include the agent's fresh -//! ITA token in the body; the CP verifies it against Intel. - -use std::sync::Arc; -use std::time::{Duration, Instant}; - -use axum::extract::{Path, State}; -use axum::http::HeaderMap; -use axum::response::{Html, IntoResponse, Response}; -use axum::routing::{get, post}; -use axum::{Json, Router}; -use base64::Engine as _; -use serde::Deserialize; -use tokio::sync::RwLock; - -use crate::config::Agent as Cfg; -use crate::ee::Ee; -use crate::error::{Error, Result}; -use crate::gh_oidc; -use crate::html::{self, shell}; -use crate::ita; -use crate::metrics; -use crate::noise_gateway; -use crate::taint::{TaintReason, TaintSet}; - -/// Re-mint interval. Intel ITA tokens typically expire in a few -/// minutes; refresh well before so `/health` always serves a live -/// token to the CP's collector. 
-const ITA_REFRESH: Duration = Duration::from_secs(180); - -/// Poll interval for syncing the device trust list from the CP. -/// Tuned so a revoke propagates within ~30s. -const DEVICES_POLL: Duration = Duration::from_secs(30); - -#[derive(Clone)] -struct St { - cfg: Arc, - ee: Arc, - hostname: String, - /// Tunnel name returned by the CP at /register — stable for the - /// life of this agent's tunnel. The /ingress/replace call on the - /// CP keys off this to look up the tunnel_id. - agent_id: String, - started: Instant, - /// Current Intel-signed JWT. Refreshed by a background task. - ita_token: Arc>, - /// Live set of per-workload ingress rules this agent has asked - /// the CP to publish. Seeded from boot `cfg.extra_ingress`; - /// appended each time a POSTed workload declares `expose`. The - /// agent forwards the full list on every /ingress/replace call - /// so the CP's PUT is a straight replacement. - extras: Arc>>, - /// Verifier for GitHub Actions OIDC JWTs — the auth on /deploy - /// and /exec. CI workflows whose principal matches - /// `DD_OWNER`/`DD_OWNER_ID`/`DD_OWNER_KIND` can call them - /// without any shared secret; anyone else is denied at claim - /// check. - gh: Arc, - /// Runtime tenant-owner set via `POST /owner`. When `Some(p)`, - /// `/deploy` / `/exec` / `/logs` accept GitHub OIDC from EITHER - /// the fleet principal OR `p` — shared admin. The `/owner` - /// endpoint itself is gated on fleet-only auth. Reset to `None` - /// on every agent boot (no persistence); the s12e bot reapplies - /// via `/owner` if the claim is still active after a restart. - agent_owner: Arc>>, - /// TDX-quote + Noise-static-pubkey bundle. Served as - /// `{ noise: { quote_b64, pubkey_hex } }` off `/health` so a - /// bastion-app can bootstrap a Noise session in one fetch (used - /// to be a separate `/attest` endpoint — folded in here). Shared - /// `Arc` with the Noise gateway module's handshake responder; - /// one keypair / one quote per boot. 
- attest: Arc, - /// Integrity taint-reason set. Seeded at boot (ArbitraryExecEnabled - /// when mutation routes are registered) and appended at runtime - /// as events happen (`/owner` → CustomerOwnerEnabled, `/deploy` - /// ok → CustomerWorkloadDeployed). Mirrored in `/health`. - taint: TaintSet, -} - -pub async fn run() -> Result<()> { - let cfg = Arc::new(Cfg::from_env()?); - let ee = Arc::new(Ee::new(&cfg.ee_socket)); - - let h = ee.health().await?; - eprintln!( - "agent: EE connected (attestation={})", - h["attestation_type"].as_str().unwrap_or("?") - ); - - let initial_token = mint_ita(&cfg, &ee).await?; - eprintln!("agent: ITA token minted"); - - eprintln!("agent: registering with {}", cfg.cp_url); - let b = register(&cfg, &initial_token).await?; - eprintln!("agent: registered as {}", b.hostname); - - spawn_cloudflared(b.tunnel_token); - - let ita_token = Arc::new(RwLock::new(initial_token)); - - // Background re-mint so /health always serves a non-expired token - // for the CP's scrape-and-verify loop. - { - let cfg = cfg.clone(); - let ee = ee.clone(); - let token = ita_token.clone(); - tokio::spawn(async move { - loop { - tokio::time::sleep(ITA_REFRESH).await; - match mint_ita(&cfg, &ee).await { - Ok(t) => { - *token.write().await = t; - eprintln!("agent: ITA token refreshed"); - } - Err(e) => { - eprintln!("agent: ITA refresh failed (keeping stale token): {e}"); - } - } - } - }); - } - - // Noise gateway runs in-process too: this agent serves the - // pre-handshake bundle inline on /health (`.noise.quote_b64` + - // `.noise.pubkey_hex`) and the Noise_IK responder on /noise/ws, - // both on the same port 8080 as everything else, so bastion-app - // CLIs can attach directly to the agent's EE instance without - // going through the CP. - let trust = noise_gateway::new_trust_handle(); - - // Background poll for the device trust list. Mutates `trust` - // in place so the local Noise responder picks up revocations - // within ~DEVICES_POLL. 
- { - let cp_url = cfg.cp_url.clone(); - let token = ita_token.clone(); - let trust = trust.clone(); - tokio::spawn(async move { - let http = reqwest::Client::new(); - loop { - if let Err(e) = sync_trusted_devices(&http, &cp_url, &token, &trust).await { - eprintln!("agent: device sync failed: {e}"); - } - tokio::time::sleep(DEVICES_POLL).await; - } - }); - } - - // Attestation keypair + upstream EE client for the Noise gateway. - let noise_key_path: std::path::PathBuf = std::env::var("DD_NOISE_KEY_PATH") - .unwrap_or_else(|_| "/run/devopsdefender/noise.key".into()) - .into(); - let attestor = Arc::new( - noise_gateway::attest::Attestor::load_or_mint(&noise_key_path) - .await - .map_err(|e| Error::Internal(format!("noise keypair: {e}")))?, - ); - eprintln!("agent: noise_pubkey={}", hex::encode(attestor.public_key())); - let ee_token = std::env::var("EE_TOKEN").ok(); - let upstream = Arc::new(noise_gateway::upstream::EeAgent::new( - std::path::PathBuf::from(noise_gateway::upstream::DEFAULT_EE_AGENT_SOCK), - ee_token, - )); - let ng_state = noise_gateway::State { - attest: attestor.clone(), - trust, - upstream, - }; - - let gh = gh_oidc::Verifier::new(cfg.common.owner.clone(), "dd-agent".into()); - - // Seed taint set. Boot-time facts go in now; runtime events - // (CustomerOwnerEnabled, CustomerWorkloadDeployed) are appended - // by their respective handlers as they happen. - let mut boot_taint: Vec = Vec::new(); - if !cfg.confidential { - boot_taint.push(TaintReason::ArbitraryExecEnabled); - } - let taint = TaintSet::with_initial(boot_taint); - - let state = St { - cfg: cfg.clone(), - ee, - hostname: b.hostname, - agent_id: b.agent_id, - started: Instant::now(), - ita_token, - extras: Arc::new(RwLock::new(cfg.extra_ingress.clone())), - gh, - attest: attestor, - agent_owner: Arc::new(RwLock::new(None)), - taint, - }; - - // Confidential mode: `/deploy`, `/exec`, and `/owner` are not - // registered at all — they 404 rather than 401. 
Attestation + - // the taint-reason set (see /health) prove to third parties - // that these mutation channels are absent, without requiring - // trust in the agent's HTTP response ("disabled? really?"). - // `/logs` stays available so observers can still stream output - // from the sealed workload. - let mut app = Router::new() - .route("/", get(dashboard)) - .route("/health", get(health)) - .route("/workload/{id}", get(workload_page)) - .route("/logs/{app}", get(logs)); - if !cfg.confidential { - app = app - .route("/deploy", post(deploy)) - .route("/exec", post(exec)) - .route("/owner", post(set_owner)); - } - let app = app - .fallback(log_unmatched) - .with_state(state) - .merge(noise_gateway::router(ng_state)) - // Wire-level request log. Fires for every HTTP request the - // listener accepts — strictly before any extractor runs and - // strictly after any handler returns, so it draws a line - // between "request reached axum" and "request died in - // CF/cloudflared before us" (the `/deploy` 2xx-empty-body - // bug from GH Actions runners). One line in, one line out, - // per request — cheap enough to keep on in prod. - .layer(axum::middleware::from_fn(log_http)); - - let addr = format!("0.0.0.0:{}", cfg.common.port); - eprintln!("agent: listening on {addr}"); - let listener = tokio::net::TcpListener::bind(&addr).await?; - axum::serve( - listener, - app.into_make_service_with_connect_info::(), - ) - .await - .map_err(|e| Error::Internal(e.to_string())) -} - -/// Logs one line per inbound HTTP request before any extractor runs, -/// and one line for the final response status. Primary motivation: -/// the `/deploy` handler's "entered" eprintln never fires for the -/// empty-2xx failures from GH runners, leaving ambiguous whether -/// the request ever crossed the CF+cloudflared boundary. This pins -/// down that boundary. 
-/// -/// Headers logged (presence or value) — chosen because each one -/// distinguishes a plausible failure mode: -/// - Content-Type + Content-Length: whether the body extractor -/// (`Json`) would accept/reject on arrival. -/// - Authorization (presence only, never the token): auth vs. -/// no-auth path. -/// - User-Agent: GH runners' curl leaves a distinct UA; a rewrite -/// upstream would show here. -/// - CF-Ray + CF-Connecting-IP: cross-reference with CF edge logs -/// — ground truth for "did CF see this and from where?" -async fn log_http( - req: axum::http::Request, - next: axum::middleware::Next, -) -> Response { - let method = req.method().clone(); - let path = req.uri().path().to_string(); - let h = req.headers(); - let get = |k: &str| { - h.get(k) - .and_then(|v| v.to_str().ok()) - .unwrap_or("-") - .to_string() - }; - let auth = if h.contains_key(axum::http::header::AUTHORIZATION) { - "yes" - } else { - "no" - }; - eprintln!( - "agent: IN {method} {path} auth={auth} ct={} cl={} ua={} cf-ray={} cf-ip={}", - get("content-type"), - get("content-length"), - get("user-agent"), - get("cf-ray"), - get("cf-connecting-ip"), - ); - let res = next.run(req).await; - eprintln!("agent: OUT {method} {path} -> {}", res.status().as_u16()); - res -} - -/// Pull the CP's device registry (`{"pubkeys": ["", ...]}`) and -/// atomically replace the local `TrustHandle`. The local Noise -/// responder reads this set directly; revocations propagate within -/// one `DEVICES_POLL` tick. -async fn sync_trusted_devices( - http: &reqwest::Client, - cp_url: &str, - ita_token: &Arc>, - trust: &noise_gateway::TrustHandle, -) -> Result<()> { - // `/api/v1/devices/trusted` is CF-Access-bypassed (see - // `cf::provision_cp_access`) so cross-VM agents can reach it over - // the public tunnel. Auth is in-code: loopback / GH-OIDC / ITA, - // same three-way policy as `/api/agents`. 
- let url = format!("{}/api/v1/devices/trusted", cp_url.trim_end_matches('/')); - let token = ita_token.read().await.clone(); - let resp = http - .get(&url) - .bearer_auth(token) - .send() - .await - .map_err(|e| Error::Upstream(format!("devices GET {url}: {e}")))?; - if !resp.status().is_success() { - return Err(Error::Upstream(format!( - "devices GET {url} → {}", - resp.status() - ))); - } - let body: serde_json::Value = resp.json().await?; - let mut fresh: std::collections::HashSet<[u8; 32]> = std::collections::HashSet::new(); - if let Some(arr) = body["pubkeys"].as_array() { - for v in arr { - let Some(s) = v.as_str() else { continue }; - let Ok(bytes) = hex::decode(s) else { continue }; - if bytes.len() != 32 { - continue; - } - let mut k = [0u8; 32]; - k.copy_from_slice(&bytes); - fresh.insert(k); - } - } - *trust.write().await = fresh; - Ok(()) -} - -#[derive(Debug, serde::Deserialize)] -struct Bootstrap { - tunnel_token: String, - hostname: String, - agent_id: String, -} - -async fn register(cfg: &Cfg, ita_token: &str) -> Result { - let http = reqwest::Client::builder() - .timeout(Duration::from_secs(60)) - .build() - .unwrap_or_else(|_| reqwest::Client::new()); - let url = format!("{}/register", cfg.cp_url.trim_end_matches('/')); - let extra_ingress: Vec = cfg - .extra_ingress - .iter() - .map(|(label, port)| serde_json::json!({"hostname_label": label, "port": port})) - .collect(); - let body = serde_json::json!({ - "vm_name": cfg.common.vm_name, - "ita_token": ita_token, - "extra_ingress": extra_ingress, - }); - - // /register is CF-Access-bypassed; ITA attestation is the gate. - // The transport layer is retried — the agent VM often boots - // faster than CF edge propagation for a just-flipped CP CNAME - // or a just-reconnected cloudflared tunnel, and the first POST - // tends to fail with "error sending request" / 502 / 530. - // Exponential-ish backoff, ~90s total. 
- let mut last_err: Option = None; - for attempt in 1..=6u32 { - match http.post(&url).json(&body).send().await { - Ok(resp) if resp.status().is_success() => { - return Ok(resp.json().await?); - } - Ok(resp) => { - let s = resp.status(); - // 4xx from the CP is almost always a real config - // error (ITA invalid, etc.) — no point retrying. - if s.is_client_error() && s != reqwest::StatusCode::TOO_MANY_REQUESTS { - let b = resp.text().await.unwrap_or_default(); - return Err(Error::Upstream(format!("register {url} → {s}: {b}"))); - } - let b = resp.text().await.unwrap_or_default(); - last_err = Some(Error::Upstream(format!("register {url} → {s}: {b}"))); - } - Err(e) => { - // Print `{:?}` so the reqwest error chain (TLS, - // DNS, connect details) lands in the agent log - // instead of just the wrapper message. - last_err = Some(Error::Upstream(format!("register {url}: {e:?}"))); - } - } - eprintln!( - "agent: register attempt {attempt}/6 failed ({}) — backing off", - last_err.as_ref().map(|e| e.to_string()).unwrap_or_default() - ); - tokio::time::sleep(Duration::from_secs(5 * attempt as u64)).await; - } - Err(last_err.unwrap_or_else(|| Error::Upstream("register: exhausted retries".into()))) -} - -/// Mint an Intel-signed TDX attestation JWT. Fatal on any failure — -/// the agent refuses to start without a valid token. -async fn mint_ita(cfg: &Cfg, ee: &Ee) -> Result { - use base64::Engine; - let nonce = base64::engine::general_purpose::STANDARD.encode(uuid::Uuid::new_v4().as_bytes()); - let quote_b64 = ee.attest(&nonce).await?["quote_b64"] - .as_str() - .ok_or_else(|| Error::Upstream("EE attest returned no quote_b64".into()))? 
- .to_string(); - ita::mint(&cfg.ita.base_url, &cfg.ita.api_key, "e_b64).await -} - -fn spawn_cloudflared(token: String) { - tokio::spawn(async move { - eprintln!("agent: spawning cloudflared"); - match tokio::process::Command::new("cloudflared") - .args([ - "tunnel", - "--no-autoupdate", - "--metrics=", - "run", - "--token", - &token, - ]) - .spawn() - { - Ok(mut child) => { - let status = child.wait().await; - eprintln!("agent: cloudflared exited: {status:?}"); - std::process::exit(1); - } - Err(e) => { - eprintln!("agent: cloudflared spawn failed: {e}"); - std::process::exit(1); - } - } - }); -} - -/// 404 with a log line. Without this, a request to a path nobody -/// registered (e.g. caused by a proxy rewrite, or a typo'd CI URL) -/// would silently get axum's default 404 — and on the dd-deploy -/// side, curl doesn't see a body, so the symptom looks like "empty -/// 200". Logging the unmatched method+path gives us ground truth -/// for whether a request reached dd-agent at all. -async fn log_unmatched( - method: axum::http::Method, - uri: axum::http::Uri, -) -> (axum::http::StatusCode, Json) { - eprintln!("agent: 404 {} {}", method, uri.path()); - ( - axum::http::StatusCode::NOT_FOUND, - Json(serde_json::json!({"code":"NOT_FOUND","message":"unmatched route"})), - ) -} - -// ── Routes ────────────────────────────────────────────────────────────── - -async fn health(State(s): State) -> Json { - let ee_health = s.ee.health().await.unwrap_or_default(); - let list = s.ee.list().await.unwrap_or_default(); - let deployments: Vec = list["deployments"] - .as_array() - .map(|a| { - a.iter() - .filter_map(|d| d["app_name"].as_str().map(String::from)) - .collect() - }) - .unwrap_or_default(); - let m = metrics::collect().await; - let ita_token = s.ita_token.read().await.clone(); - let agent_owner = s.agent_owner.read().await.clone(); - let taint_reasons = s.taint.snapshot().await; - let extra_ingress: Vec = s - .extras - .read() - .await - .iter() - .map(|(label, port)| 
serde_json::json!({"hostname_label": label, "port": port})) - .collect(); - - // Back-compat surface: pre-Principal /health consumers - // (satsforcompute's bot, owner-update.yml, anything else keying - // off the /health JSON) read `agent_owner` and `owner` as plain - // strings. Keep them strings (the principal `name`) and expose - // the structured form alongside as `*_principal` for new callers. - let fleet_owner_name = s.cfg.common.owner.name.clone(); - let agent_owner_name = agent_owner.as_ref().map(|p| p.name.clone()); - - Json(serde_json::json!({ - "ok": true, - "service": "agent", - "agent_id": s.agent_id, - "vm_name": s.cfg.common.vm_name, - "hostname": s.hostname, - // `owner` / `agent_owner`: strings, principal name only — - // back-compat for pre-Principal consumers. Structured form - // (with id and kind) is on the `*_principal` keys below. - "owner": fleet_owner_name, - "fleet_owner": fleet_owner_name, - "agent_owner": agent_owner_name, - "fleet_owner_principal": s.cfg.common.owner, - "agent_owner_principal": agent_owner, - // Integrity surface (SATS_FOR_COMPUTE_SPEC Integrity States). - // `confidential_mode`: boot-time flag; true → /deploy + /exec - // + /owner were NOT registered on this agent. Set from - // `DD_CONFIDENTIAL`. - // `taint_reasons`: current set, sorted for diff-friendliness. - // Empty set = pristine. v0: informational — DD doesn't block - // actions based on the set. 
- "confidential_mode": s.cfg.confidential, - "taint_reasons": taint_reasons, - "attestation_type": ee_health["attestation_type"].as_str().unwrap_or("unknown"), - "deployments": deployments, - "deployment_count": list["deployments"].as_array().map(|a| a.len()).unwrap_or(0), - "cpu_percent": m.cpu_pct, - "memory_used_mb": m.mem_used_mb, - "memory_total_mb": m.mem_total_mb, - "swap_used_mb": m.swap_used_mb, - "swap_total_mb": m.swap_total_mb, - "load_1m": m.load_1m, - "load_5m": m.load_5m, - "load_15m": m.load_15m, - "nets": m.nets, - "disks": m.disks, - "uptime_secs": s.started.elapsed().as_secs(), - "system_uptime_secs": m.uptime_secs, - "ita_token": ita_token, - "extra_ingress": extra_ingress, - // Pre-Noise-handshake bundle — stable per boot. Used to be a - // standalone `GET /attest` endpoint. Keeping it here lets a - // bastion-app bootstrap with one fetch and drops a CF Access - // bypass-app per env × per service. `quote_b64` binds the - // raw Noise pubkey into its TDX `report_data`; clients verify - // the Intel signature and pin the pubkey from the quote — no - // TOFU needed. 
- "noise": { - "quote_b64": base64::engine::general_purpose::STANDARD.encode(s.attest.quote()), - "pubkey_hex": hex::encode(s.attest.public_key()), - }, - })) -} - -async fn dashboard(State(s): State) -> Response { - let m = metrics::collect().await; - let list = s.ee.list().await.unwrap_or_default(); - let ee_health = s.ee.health().await.unwrap_or_default(); - let att = ee_health["attestation_type"].as_str().unwrap_or("unknown"); - - let deployments: Vec<&serde_json::Value> = list["deployments"] - .as_array() - .map(|a| a.iter().collect()) - .unwrap_or_default(); - - let mut rows = String::new(); - for d in &deployments { - let status = d["status"].as_str().unwrap_or("idle"); - let cls = match status { - "running" => "running", - "deploying" => "deploying", - "failed" | "exited" => "failed", - _ => "idle", - }; - let id = d["id"].as_str().unwrap_or(""); - let app = d["app_name"].as_str().unwrap_or("unnamed"); - let image = d["image"].as_str().unwrap_or(""); - rows.push_str(&format!( - r#"{app}{status}{image}logs"# - )); - } - - let table = if deployments.is_empty() { - r#"
No workloads running
"#.to_string() - } else { - format!( - r#"{rows}
appstatusimage
"# - ) - }; - - // `{hostname-base}-block.{tld}` is the ttyd subdomain provisioned - // at register time. Human-gated by CF Access. Flat shape so - // Universal SSL covers the cert. - let term_host = html::escape(&crate::cf::label_hostname(&s.hostname, "block")); - - let body = format!( - r#"

{hostname}

-
{vm} · {att}
-
healthy · uptime {up} · {count} workload(s) · Terminal ↗
-
-
CPU
{cpu}%
-
Memory
{mu} / {mt}
-
Load 1m
{load:.2}
-
-
Workloads
{table}"#, - term_host = term_host, - hostname = html::escape(&s.hostname), - vm = html::escape(&s.cfg.common.vm_name), - att = html::escape(att), - up = metrics::format_duration_secs(s.started.elapsed().as_secs()), - count = deployments.len(), - cpu = m.cpu_pct, - mu = metrics::format_bytes_mb(m.mem_used_mb), - mt = metrics::format_bytes_mb(m.mem_total_mb), - load = m.load_1m, - ); - - Html(shell( - &format!("DD — {}", s.cfg.common.vm_name), - &html::nav(&[("Dashboard", "/", true)]), - &body, - )) - .into_response() -} - -async fn workload_page(State(s): State, Path(id): Path) -> Result { - let list = s.ee.list().await?; - let deployments: Vec<&serde_json::Value> = list["deployments"] - .as_array() - .map(|a| a.iter().collect()) - .unwrap_or_default(); - let d = deployments - .iter() - .find(|d| d["id"].as_str() == Some(id.as_str())) - .ok_or(Error::NotFound)?; - let app = d["app_name"].as_str().unwrap_or("unnamed"); - let status = d["status"].as_str().unwrap_or("unknown"); - let image = d["image"].as_str().unwrap_or(""); - let started = d["started_at"].as_str().unwrap_or(""); - let error = d["error_message"].as_str().unwrap_or(""); - - let logs = s.ee.logs(&id).await.unwrap_or_default(); - let log_text = logs["lines"] - .as_array() - .map(|a| { - a.iter() - .filter_map(|v| v.as_str()) - .map(html::escape) - .collect::>() - .join("\n") - }) - .unwrap_or_default(); - - let err_row = if error.is_empty() { - String::new() - } else { - format!( - r#"
Error{}
"#, - html::escape(error) - ) - }; - - let body = format!( - r#" -

{app}

-
{id}
-
-
Status{status}
-
Image{image}
-
Started{started}
- {err_row} -
-
Logs
-
{logs}
"#, - app = html::escape(app), - id = html::escape(&id), - cls = match status { - "running" => "running", - "deploying" => "deploying", - "failed" | "exited" => "failed", - _ => "idle", - }, - status = html::escape(status), - image = html::escape(image), - started = html::escape(started), - err_row = err_row, - logs = if log_text.is_empty() { - "No logs".into() - } else { - log_text - }, - ); - - Ok(Html(shell( - &format!("DD — {app}"), - &html::nav(&[("Dashboard", "/", false)]), - &body, - )) - .into_response()) -} - -/// Extract the `Authorization: Bearer ` header and return the -/// trimmed token body. Shared by fleet-only and fleet-or-agent auth -/// paths so the Bearer-parsing shape stays consistent. -fn bearer_token(headers: &HeaderMap) -> Result<&str> { - let auth = headers - .get(axum::http::header::AUTHORIZATION) - .and_then(|v| v.to_str().ok()) - .ok_or(Error::Unauthorized)?; - auth.strip_prefix("Bearer ") - .or_else(|| auth.strip_prefix("bearer ")) - .map(str::trim) - .filter(|t| !t.is_empty()) - .ok_or(Error::Unauthorized) -} - -/// Workload-control auth (`/deploy`, `/exec`, `/logs`): accept a -/// GitHub Actions OIDC token whose principal (per -/// [`gh_oidc::Principal::matches`]) is EITHER the fleet owner -/// (`DD_OWNER`/`DD_OWNER_ID`/`DD_OWNER_KIND`) OR the agent's runtime -/// `agent_owner`. Shared admin — ops and the active tenant both -/// have deploy/exec/logs authority. -async fn require_gh_oidc(s: &St, headers: &HeaderMap) -> Result { - let token = bearer_token(headers)?; - let agent_owner = s.agent_owner.read().await.clone(); - s.gh.verify_allowing(token, agent_owner.as_ref()).await -} - -/// Fleet-only auth. Used by `/owner`, which re-assigns the tenant: -/// only ops (the fleet principal) may call it, never the tenant -/// themselves. 
-async fn require_fleet_oidc(s: &St, headers: &HeaderMap) -> Result { - let token = bearer_token(headers)?; - s.gh.verify(token).await -} - -async fn deploy( - State(s): State, - headers: HeaderMap, - Json(spec): Json, -) -> Result> { - // Log entry *before* auth so we can tell CF-Access-intercepts - // (no handler entry at all) from OIDC failures (entry + reject). - eprintln!( - "agent: /deploy entered (has_auth={}, app={})", - headers.contains_key(axum::http::header::AUTHORIZATION), - spec.get("app_name").and_then(|v| v.as_str()).unwrap_or("?") - ); - let claims = require_gh_oidc(&s, &headers).await?; - eprintln!( - "agent: /deploy by {} (repo={}, ref={})", - claims.sub, claims.repository, claims.ref_ - ); - - // Pull `expose` off the spec before forwarding to EE. EE ignores - // unknown fields today but keeping the payload tidy avoids future - // surprises if EE ever grows stricter parsing. - let expose = parse_expose(&spec); - - let response = s.ee.deploy(spec).await?; - - // Once a runtime deploy has succeeded, the node is not pristine - // anymore. Set idempotently — only the first successful /deploy - // per boot actually inserts. - s.taint.insert(TaintReason::CustomerWorkloadDeployed).await; - - if let Some((label, port)) = expose { - if let Err(e) = push_extra_ingress(&s, label.clone(), port).await { - // Soft-fail: the workload is deployed, the owner just can't - // reach it from the public internet yet. Better than failing - // the whole /deploy and leaving the caller unsure whether - // the process is running. - eprintln!( - "agent: /ingress/replace add {label}:{port} failed (workload still running): {e}" - ); - } - } - - Ok(Json(response)) -} - -/// Extract `expose.hostname_label` + `expose.port` from a DeployRequest -/// JSON body. Returns None if the field is missing or malformed; the -/// caller treats that as "no runtime ingress requested" and moves on. 
-fn parse_expose(spec: &serde_json::Value) -> Option<(String, u16)> { - let expose = spec.get("expose")?; - let label = expose.get("hostname_label")?.as_str()?.to_string(); - let port = expose.get("port")?.as_u64()?; - if label.is_empty() || port == 0 || port > u16::MAX as u64 { - return None; - } - Some((label, port as u16)) -} - -/// Append `(label, port)` to the live extras list (dedup by label — -/// redeploying the same app_name with the same hostname_label is a -/// no-op, not a duplicate rule) and POST the full list to the CP's -/// /ingress/replace endpoint. The CP re-PUTs the tunnel config and -/// upserts CNAMEs. -async fn push_extra_ingress(s: &St, label: String, port: u16) -> Result<()> { - let extras = { - let mut guard = s.extras.write().await; - if let Some(existing) = guard.iter_mut().find(|(l, _)| *l == label) { - existing.1 = port; - } else { - guard.push((label, port)); - } - guard.clone() - }; - - let body_extras: Vec = extras - .iter() - .map(|(l, p)| serde_json::json!({"hostname_label": l, "port": p})) - .collect(); - let ita_token = s.ita_token.read().await.clone(); - let body = serde_json::json!({ - "agent_id": s.agent_id, - "ita_token": ita_token, - "extras": body_extras, - }); - - let url = format!("{}/ingress/replace", s.cfg.cp_url.trim_end_matches('/')); - let resp = reqwest::Client::new() - .post(&url) - .json(&body) - .send() - .await - .map_err(|e| Error::Upstream(format!("ingress/replace {url}: {e}")))?; - if !resp.status().is_success() { - let status = resp.status(); - let text = resp.text().await.unwrap_or_default(); - return Err(Error::Upstream(format!( - "ingress/replace {url} → {status}: {text}" - ))); - } - eprintln!("agent: ingress/replace ok ({} extras total)", extras.len()); - Ok(()) -} - -#[derive(Debug, Deserialize)] -struct ExecReq { - cmd: Vec, - #[serde(default = "default_exec_timeout")] - timeout_secs: u64, -} -fn default_exec_timeout() -> u64 { - 60 -} - -/// GET /logs/{app} — look up the EE deployment id for `app` 
and -/// return EE's captured stdout. Gated by GH OIDC, same as /deploy -/// and /exec. 404 if no deployment with that `app_name` exists — -/// not a server error, callers often probe for optional workloads. -async fn logs( - State(s): State, - Path(app): Path, - headers: HeaderMap, -) -> Result> { - let _ = require_gh_oidc(&s, &headers).await?; - let list = s.ee.list().await?; - let id = list["deployments"] - .as_array() - .and_then(|a| { - a.iter() - .find(|d| d["app_name"].as_str() == Some(app.as_str())) - }) - .and_then(|d| d["id"].as_str()) - .map(String::from) - .ok_or(Error::NotFound)?; - Ok(Json(s.ee.logs(&id).await?)) -} - -async fn exec( - State(s): State, - headers: HeaderMap, - Json(req): Json, -) -> Result> { - let _ = require_gh_oidc(&s, &headers).await?; - Ok(Json(s.ee.exec(&req.cmd, req.timeout_secs).await?)) -} - -#[derive(Debug, Deserialize)] -struct OwnerReq { - /// New tenant principal name. Empty string clears the runtime - /// owner — after which `/deploy`/`/exec`/`/logs` accept only - /// the fleet owner again. When non-empty, `agent_owner_id` and - /// `agent_owner_kind` are required and validated for shape - /// consistency (see [`gh_oidc::Principal::from_parts`]). - agent_owner: String, - /// Numeric GitHub id of the principal. Required when - /// `agent_owner` is non-empty; ignored when clearing. - #[serde(default)] - agent_owner_id: u64, - /// `"user" | "org" | "repo"`. Required when `agent_owner` is - /// non-empty. - #[serde(default)] - agent_owner_kind: String, - /// Opaque ID from the caller's claim system (e.g. the s12e bot's - /// claim issue). Logged for audit; the agent doesn't interpret it. - #[serde(default)] - claim_id: String, -} - -/// POST /owner — set (or clear) the agent's runtime tenant owner. -/// Fleet-gated: only the fleet principal can reassign a node. -/// Runtime-only state: resets to `None` on reboot, so a crash/restart -/// is self-healing (the bot re-applies if the claim is still active). 
-async fn set_owner( - State(s): State, - headers: HeaderMap, - Json(req): Json, -) -> Result> { - let claims = require_fleet_oidc(&s, &headers).await?; - let new_owner: Option = { - let trimmed = req.agent_owner.trim(); - if trimmed.is_empty() { - None - } else { - let kind = gh_oidc::PrincipalKind::parse(&req.agent_owner_kind)?; - Some(gh_oidc::Principal::from_parts( - trimmed.to_string(), - req.agent_owner_id, - kind, - )?) - } - }; - let previous = { - let mut guard = s.agent_owner.write().await; - let prev = guard.clone(); - guard.clone_from(&new_owner); - prev - }; - // Taint only when transitioning to a NON-fleet owner. Clearing - // (new_owner == None) leaves the existing flag set — we don't - // untaint a node that was ever customer-owned, since a past - // tenant could have exfiltrated via /exec while their window - // was active. Setting to a new tenant is idempotent via HashSet. - if new_owner.is_some() { - s.taint.insert(TaintReason::CustomerOwnerEnabled).await; - } - let display = |o: &Option| -> String { - o.as_ref() - .map(|p| format!("{}({}/{})", p.name, p.kind.as_str(), p.id)) - .unwrap_or_else(|| "".into()) - }; - eprintln!( - "agent: /owner {} -> {} (by sub={}, claim_id={})", - display(&previous), - display(&new_owner), - claims.sub, - if req.claim_id.is_empty() { - "" - } else { - req.claim_id.as_str() - }, - ); - // Same back-compat as /health: existing callers (s12e bot, - // owner-update.yml) parse `agent_owner` / `previous_owner` as - // strings. Keep them strings here too; surface the structured - // form on `*_principal`. 
- Ok(Json(serde_json::json!({ - "agent_id": s.agent_id, - "agent_owner": new_owner.as_ref().map(|p| p.name.clone()), - "agent_owner_principal": new_owner, - "previous_owner": previous.as_ref().map(|p| p.name.clone()), - "previous_owner_principal": previous, - "claim_id": req.claim_id, - }))) -} diff --git a/src/cf.rs b/src/cf.rs deleted file mode 100644 index 1472a51..0000000 --- a/src/cf.rs +++ /dev/null @@ -1,873 +0,0 @@ -//! Cloudflare tunnel + DNS client. -//! -//! Naming convention (enforced here): CP tunnels are `dd-{env}-cp-{uuid}`, -//! agent tunnels are `dd-{env}-agent-{uuid}`. The suffix is how the -//! collector knows which tunnels to scrape and STONITH knows which to -//! target, without ever fetching the ingress config. - -use base64::Engine; -use reqwest::{Client, Method}; -use serde::{Deserialize, Serialize}; - -use crate::config::CfCreds; -use crate::error::{Error, Result}; - -const API: &str = "https://api.cloudflare.com/client/v4"; - -pub fn cp_tunnel_name(env: &str) -> String { - format!("dd-{env}-cp-{}", uuid::Uuid::new_v4()) -} -pub fn agent_tunnel_name(env: &str) -> String { - format!("dd-{env}-agent-{}", uuid::Uuid::new_v4()) -} -pub fn cp_prefix(env: &str) -> String { - format!("dd-{env}-cp-") -} -pub fn agent_prefix(env: &str) -> String { - format!("dd-{env}-agent-") -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Tunnel { - pub id: String, - pub token: String, - pub hostname: String, - #[serde(default)] - pub extra_hostnames: Vec, -} - -/// One-liner HTTP wrapper: all CF calls go through this. Returns the -/// parsed JSON body on any 2xx, turns anything else into `Error::Upstream` -/// carrying the response text. 
-async fn call( - http: &Client, - cf: &CfCreds, - method: Method, - path: &str, - body: Option, -) -> Result { - let mut req = http - .request(method.clone(), format!("{API}{path}")) - .bearer_auth(&cf.api_token); - if let Some(b) = body { - req = req.json(&b); - } - let resp = req.send().await?; - let status = resp.status(); - if !status.is_success() { - let body = resp.text().await.unwrap_or_default(); - return Err(Error::Upstream(format!( - "CF {method} {path} → {status}: {body}" - ))); - } - let parsed: serde_json::Value = resp.json().await?; - // CF v4 API returns 200 with {success: false, errors: [...]} on - // validation failures. We used to silently drop those — which led - // to Access apps that appeared created but never matched any - // requests (manifested as /health etc. 503'ing behind the root - // human app). Promote to Err so the CP's `?` unwinding catches it. - if parsed.get("success") == Some(&serde_json::Value::Bool(false)) { - let errors = parsed - .get("errors") - .map(|e| e.to_string()) - .unwrap_or_default(); - return Err(Error::Upstream(format!( - "CF {method} {path}: success=false errors={errors}" - ))); - } - Ok(parsed) -} - -/// Create (or recreate) a CF tunnel with ingress pointing at the local -/// service on port 8080, a proxied CNAME for `hostname`, and one -/// additional `{label}.{hostname}` → `localhost:{port}` ingress + -/// CNAME per entry in `extras`. Extras are prepended to the ingress -/// rules so they match before the primary wildcard catch-all. -pub async fn create( - http: &Client, - cf: &CfCreds, - name: &str, - hostname: &str, - extras: &[(String, u16)], -) -> Result { - delete_by_name(http, cf, name).await; - - let secret = base64::engine::general_purpose::STANDARD.encode(uuid::Uuid::new_v4().as_bytes()); - // `config_src: "cloudflare"` marks the tunnel as dashboard/API-managed - // so cloudflared won't try to push its (empty) local config at connect - // time. 
Without this, cloudflared 2026.3.0 logs - // `ERR unable to send local configuration … Invalid ConfigurationSource - // change` and the tunnel never registers a connection — the CP's - // /health endpoint stays unreachable and `Wait for agent health` - // times out. - let resp = call( - http, - cf, - Method::POST, - &format!("/accounts/{}/cfd_tunnel", cf.account_id), - Some(serde_json::json!({ - "name": name, - "tunnel_secret": secret, - "config_src": "cloudflare", - })), - ) - .await?; - - let id = resp["result"]["id"] - .as_str() - .ok_or_else(|| Error::Upstream("tunnel create: missing id".into()))? - .to_string(); - let token = resp["result"]["token"] - .as_str() - .ok_or_else(|| Error::Upstream("tunnel create: missing token".into()))? - .to_string(); - - let extra_hostnames = apply_ingress(http, cf, &id, hostname, extras).await?; - - Ok(Tunnel { - id, - token, - hostname: hostname.to_string(), - extra_hostnames, - }) -} - -/// Replace an existing tunnel's ingress rules + CNAME records. Used -/// for runtime updates — e.g. a workload POSTed to `/deploy` declares -/// `expose`, the agent forwards the full current extras list to the -/// CP, and the CP calls this to re-PUT the tunnel config without -/// recreating the tunnel or touching the tunnel token. Returns the -/// resolved extra hostnames so the caller can log / store them. -pub async fn update_ingress( - http: &Client, - cf: &CfCreds, - tunnel_id: &str, - hostname: &str, - extras: &[(String, u16)], -) -> Result> { - apply_ingress(http, cf, tunnel_id, hostname, extras).await -} - -/// Turn `(hostname="pr-144.devopsdefender.com", label="term")` into -/// `"pr-144-term.devopsdefender.com"`. Cloudflare's Universal SSL -/// only covers one level of wildcard (`*.devopsdefender.com`), so -/// we can't nest sub-workload subdomains under the agent's hostname -/// — the TLS handshake fails for `foo.bar.devopsdefender.com`. -/// Flattening the prefix keeps every workload URL one-level deep -/// under the zone apex. 
-pub fn label_hostname(hostname: &str, label: &str) -> String { - match hostname.split_once('.') { - Some((base, rest)) => format!("{base}-{label}.{rest}"), - None => format!("{hostname}-{label}"), - } -} - -/// Build the ingress array (extras first, then the primary -/// `hostname → localhost:8080` rule, then the 404 catch-all), PUT -/// it to the tunnel, and upsert a CNAME for each hostname pointing -/// at `{tunnel_id}.cfargotunnel.com`. -async fn apply_ingress( - http: &Client, - cf: &CfCreds, - tunnel_id: &str, - hostname: &str, - extras: &[(String, u16)], -) -> Result> { - let mut ingress: Vec = extras - .iter() - .map(|(label, port)| { - serde_json::json!({ - "hostname": label_hostname(hostname, label), - "service": format!("http://localhost:{port}"), - }) - }) - .collect(); - ingress.push(serde_json::json!({ - "hostname": hostname, - "service": "http://localhost:8080", - })); - ingress.push(serde_json::json!({"service": "http_status:404"})); - - call( - http, - cf, - Method::PUT, - &format!( - "/accounts/{}/cfd_tunnel/{tunnel_id}/configurations", - cf.account_id - ), - Some(serde_json::json!({"config": {"ingress": ingress}})), - ) - .await?; - - upsert_cname(http, cf, tunnel_id, hostname).await?; - let mut extra_hostnames = Vec::with_capacity(extras.len()); - for (label, _) in extras { - let extra = label_hostname(hostname, label); - upsert_cname(http, cf, tunnel_id, &extra).await?; - extra_hostnames.push(extra); - } - Ok(extra_hostnames) -} - -async fn upsert_cname(http: &Client, cf: &CfCreds, tunnel_id: &str, hostname: &str) -> Result<()> { - let content = format!("{tunnel_id}.cfargotunnel.com"); - let body = serde_json::json!({ - "type": "CNAME", "name": hostname, "content": content, "proxied": true, - }); - match find_record_id(http, cf, hostname).await? 
{ - Some(rec) => { - call( - http, - cf, - Method::PUT, - &format!("/zones/{}/dns_records/{rec}", cf.zone_id), - Some(body), - ) - .await?; - } - None => { - call( - http, - cf, - Method::POST, - &format!("/zones/{}/dns_records", cf.zone_id), - Some(body), - ) - .await?; - } - } - Ok(()) -} - -pub async fn find_record_id(http: &Client, cf: &CfCreds, hostname: &str) -> Result> { - // Best-effort lookup: CF returns 200 with an empty array on miss. - let resp = call( - http, - cf, - Method::GET, - &format!( - "/zones/{}/dns_records?type=CNAME&name={hostname}", - cf.zone_id - ), - None, - ) - .await - .unwrap_or(serde_json::Value::Null); - Ok(resp["result"] - .as_array() - .and_then(|a| a.first()) - .and_then(|r| r["id"].as_str()) - .map(String::from)) -} - -pub async fn delete_cname(http: &Client, cf: &CfCreds, hostname: &str) -> Result<()> { - if let Some(id) = find_record_id(http, cf, hostname).await? { - let _ = call( - http, - cf, - Method::DELETE, - &format!("/zones/{}/dns_records/{id}", cf.zone_id), - None, - ) - .await; - } - Ok(()) -} - -/// Best-effort delete by name. Used for STONITH + idempotent re-create; -/// callers can't act usefully on failure. -pub async fn delete_by_name(http: &Client, cf: &CfCreds, name: &str) { - let Ok(resp) = call( - http, - cf, - Method::GET, - &format!("/accounts/{}/cfd_tunnel?name={name}", cf.account_id), - None, - ) - .await - else { - return; - }; - let Some(items) = resp["result"].as_array() else { - return; - }; - for t in items { - let Some(id) = t["id"].as_str() else { continue }; - let _ = call( - http, - cf, - Method::DELETE, - &format!("/accounts/{}/cfd_tunnel/{id}/connections", cf.account_id), - None, - ) - .await; - let _ = call( - http, - cf, - Method::DELETE, - &format!("/accounts/{}/cfd_tunnel/{id}", cf.account_id), - None, - ) - .await; - } -} - -pub async fn list(http: &Client, cf: &CfCreds) -> Result> { - // `per_page=200` — CF's default is 20, and every caller here scans - // for a name-prefix match. 
A predecessor CP tunnel sorting off page 1 - // silently breaks both `stonith::kill_old_tunnels` (leaks the old - // tunnel instead of STONITHing it) and `cp::run`'s hydrate gate - // (skips hydrate, losing devices + agents on a deploy). 200 is the - // same cap `.github/workflows/cleanup.yml` uses. - let resp = call( - http, - cf, - Method::GET, - &format!( - "/accounts/{}/cfd_tunnel?is_deleted=false&per_page=200", - cf.account_id - ), - None, - ) - .await?; - Ok(resp["result"].as_array().cloned().unwrap_or_default()) -} - -/// `Some(true)` if present, `Some(false)` if confirmed deleted, `None` -/// on ambiguous transport error — the watchdog uses `None` to mean -/// "don't count as gone" and avoid flaky kernel_poweroffs. -pub async fn exists(http: &Client, cf: &CfCreds, tunnel_id: &str) -> Option { - let resp = http - .get(format!( - "{API}/accounts/{}/cfd_tunnel/{tunnel_id}", - cf.account_id - )) - .bearer_auth(&cf.api_token) - .send() - .await - .ok()?; - match resp.status().as_u16() { - 404 => Some(false), - s if (200..300).contains(&s) => { - let body: serde_json::Value = resp.json().await.ok()?; - Some(body["result"]["deleted_at"].is_null()) - } - _ => None, - } -} - -// ── CF Access (Zero Trust) provisioning ──────────────────────────────── -// -// The CP provisions a handful of Access apps at startup and one human -// app per agent at /register. Everything machine-to-machine uses -// bypass apps + in-code auth (ITA for /register + /ingress/replace, -// GitHub Actions OIDC for agent /deploy + /exec). No service tokens, -// no External Evaluation — just CF Access for humans and bypass for -// everything else. - -/// Return the UUID of the GitHub login method in this CF Access -/// account, if configured. Manual one-time setup in the Cloudflare -/// dashboard (Zero Trust → Settings → Authentication → Login methods -/// → add GitHub) is required before the CP can provision org-based -/// policies. 
-pub async fn github_idp_uuid(http: &Client, cf: &CfCreds) -> Result { - let resp = call( - http, - cf, - Method::GET, - &format!("/accounts/{}/access/identity_providers", cf.account_id), - None, - ) - .await?; - resp["result"] - .as_array() - .and_then(|items| items.iter().find(|i| i["type"].as_str() == Some("github"))) - .and_then(|i| i["id"].as_str()) - .map(String::from) - .ok_or_else(|| { - Error::Upstream( - "CF Access has no GitHub identity provider — add one in \ - Zero Trust → Settings → Authentication → Login methods" - .into(), - ) - }) -} - -/// List all Access apps, return the full app JSON for one whose primary -/// or included `domain` exactly matches `domain`, or `None`. -async fn find_app_by_domain( - http: &Client, - cf: &CfCreds, - domain: &str, -) -> Result> { - let resp = call( - http, - cf, - Method::GET, - &format!("/accounts/{}/access/apps?per_page=1000", cf.account_id), - None, - ) - .await?; - Ok(resp["result"].as_array().and_then(|items| { - items - .iter() - .find(|a| a["domain"].as_str() == Some(domain)) - .cloned() - })) -} - -/// Build the human-facing CF Access policy used for the CP root and -/// each per-agent dashboard. `admin_email` is always included as the -/// operator escape hatch. The GitHub-side rule depends on the -/// principal's kind, because Cloudflare Access has no include rule -/// for "specific GitHub user login" or "specific repository": -/// -/// kind=org → adds a `github-organization` include — anyone in -/// that org login is admitted. -/// kind=user → admin_email-only. CF Access can't gate on a -/// specific GitHub user login by name; the operator -/// gets in by email. -/// kind=repo → admin_email-only, same reason. -/// -/// The two non-org kinds losing GitHub-side dashboard access is not -/// a regression: the prior behavior — a `github-organization` rule -/// configured with a user login — silently matched nobody. 
-fn human_policy( - owner: &crate::gh_oidc::Principal, - admin_email: &str, - gh_idp_uuid: &str, -) -> serde_json::Value { - let mut includes = vec![serde_json::json!({ - "email": { "email": admin_email } - })]; - if let crate::gh_oidc::PrincipalKind::Org = owner.kind { - includes.push(serde_json::json!({ - "github-organization": { - "name": owner.name, - "identity_provider_id": gh_idp_uuid, - } - })); - } - serde_json::json!({ - "name": "dd-human", - "decision": "allow", - "include": includes, - }) -} - -fn bypass_policy() -> serde_json::Value { - serde_json::json!({ - "name": "dd-bypass", - "decision": "bypass", - "include": [ { "everyone": {} } ], - }) -} - -/// Idempotently upsert a self-hosted Access app at `domain` with the -/// provided policy list. Matches on exact `domain`; updates in place -/// if present, creates otherwise. -async fn ensure_app( - http: &Client, - cf: &CfCreds, - name: &str, - domain: &str, - policies: Vec, -) -> Result { - let body = serde_json::json!({ - "name": name, - "domain": domain, - "type": "self_hosted", - "session_duration": "24h", - "app_launcher_visible": false, - "policies": policies, - }); - if let Some(existing) = find_app_by_domain(http, cf, domain).await? { - let id = existing["id"].as_str().unwrap_or_default().to_string(); - call( - http, - cf, - Method::PUT, - &format!("/accounts/{}/access/apps/{id}", cf.account_id), - Some(body), - ) - .await?; - return Ok(id); - } - let resp = call( - http, - cf, - Method::POST, - &format!("/accounts/{}/access/apps", cf.account_id), - Some(body), - ) - .await?; - resp["result"]["id"] - .as_str() - .map(String::from) - .ok_or_else(|| Error::Upstream(format!("Access app create missing id for {domain}"))) -} - -/// Idempotently upsert a path-scoped bypass Access app. Anyone can -/// reach `domain/path` without authentication; used for /health, -/// /register (which is authenticated by ITA in-app), and every -/// workload-exposed URL. 
-async fn ensure_bypass_app(http: &Client, cf: &CfCreds, name: &str, domain: &str) -> Result<()> { - ensure_app(http, cf, name, domain, vec![bypass_policy()]).await?; - Ok(()) -} - -/// Hostname labels that run admin workloads (shell access, future -/// log viewer, future metrics panel). These get a human CF Access -/// app, not a public bypass — otherwise exposing ttyd on a public -/// subdomain would be a free shell for the internet. -/// -/// - `term` — legacy ttyd subdomain (kept admin-gated for older deploys). -/// - `block` — ttyd workload; exposing it without auth is the same -/// "free shell for the internet" risk. -const ADMIN_LABELS: &[&str] = &["term", "block"]; - -fn is_admin_label(label: &str) -> bool { - ADMIN_LABELS.contains(&label) -} - -/// Provision the CP's Access apps at startup. -/// -/// Apps created: -/// - Human: `{hostname}` — GitHub org or admin email (dashboard, /agent/*, /cp/*) -/// - Human: `term.{hostname}` — ttyd shell, org members only -/// - Bypass: `{hostname}/health` — public (read-only fleet health; -/// also carries the Noise pre-handshake `{quote_b64, pubkey_hex}`) -/// - Bypass: `{hostname}/api/agents` — read-only agent list -/// - Bypass: `{hostname}/noise/ws` — Noise_IK-gated in code -/// - Bypass: `{hostname}/register` — ITA-gated in code -/// - Bypass: `{hostname}/ingress/replace` — ITA-gated in code -pub async fn provision_cp_access( - http: &Client, - cf: &CfCreds, - env: &str, - hostname: &str, - owner: &crate::gh_oidc::Principal, - admin_email: &str, - workload_labels: &[String], -) -> Result<()> { - let idp = github_idp_uuid(http, cf).await?; - let human = human_policy(owner, admin_email, &idp); - - ensure_app( - http, - cf, - &format!("dd-{env}-cp"), - hostname, - vec![human.clone()], - ) - .await?; - - // One CF Access app per CP-exposed workload label. Admin labels - // (from `ADMIN_LABELS` — e.g. the ttyd terminal at `block`) get - // the human policy; anything else gets a public bypass. 
- let desired: std::collections::HashSet = workload_labels - .iter() - .map(|l| label_hostname(hostname, l)) - .collect(); - for label in workload_labels { - let domain = label_hostname(hostname, label); - if is_admin_label(label) { - ensure_app( - http, - cf, - &format!("dd-{env}-cp-{label}"), - &domain, - vec![human_policy(owner, admin_email, &idp)], - ) - .await?; - } else { - ensure_bypass_app(http, cf, &format!("dd-{env}-cp-{label}"), &domain).await?; - } - } - - // Reap any stale workload apps under the CP's flat subdomain space - // that are no longer in the desired label set. Shape is - // `{base}-{label}.{tld}` — prefix+suffix match so we don't touch - // the CP's own root human app or any bypass-path apps. - let (base, tld) = hostname.split_once('.').unwrap_or((hostname, "")); - let prefix = format!("{base}-"); - let suffix_tld = format!(".{tld}"); - if let Ok(resp) = call( - http, - cf, - Method::GET, - &format!("/accounts/{}/access/apps?per_page=1000", cf.account_id), - None, - ) - .await - { - if let Some(items) = resp["result"].as_array() { - for app in items { - let Some(domain) = app["domain"].as_str() else { - continue; - }; - if !(domain.starts_with(&prefix) && domain.ends_with(&suffix_tld)) { - continue; - } - if desired.contains(domain) { - continue; - } - if let Some(id) = app["id"].as_str() { - let _ = call( - http, - cf, - Method::DELETE, - &format!("/accounts/{}/access/apps/{id}", cf.account_id), - None, - ) - .await; - } - } - } - } - // One-shot reap: any prior deploy left behind a - // `dd-{env}-cp-noise-attest` bypass app fronting `{hostname}/attest`. - // The attest route is gone, so the bypass has no job. The workload - // sweep above is subdomain-scoped and won't catch path-bypass apps, - // so delete it explicitly here. Lookup-by-domain is idempotent; - // no-op once the app is gone. 
- if let Ok(Some(stale)) = find_app_by_domain(http, cf, &format!("{hostname}/attest")).await { - if let Some(id) = stale["id"].as_str() { - let _ = call( - http, - cf, - Method::DELETE, - &format!("/accounts/{}/access/apps/{id}", cf.account_id), - None, - ) - .await; - } - } - - for (path_suffix, label) in [ - ("/health", "health"), - ("/api/agents", "api-agents"), - ("/api/v1/devices/trusted", "api-devices-trusted"), - ("/api/v1/admin/export", "api-admin-export"), - // The Noise pre-handshake bundle (`quote_b64` + `pubkey_hex`) - // now rides on `/health` — `/attest` was deleted. Only the - // handshake itself still needs its own bypass. - ("/noise/ws", "noise-ws"), - ("/register", "register"), - ("/ingress/replace", "ingress"), - ] { - ensure_bypass_app( - http, - cf, - &format!("dd-{env}-cp-{label}"), - &format!("{hostname}{path_suffix}"), - ) - .await?; - } - Ok(()) -} - -/// Provision Access apps for one agent. Called at /register and again -/// at /ingress/replace so newly-exposed workload labels get their -/// bypass apps and labels that disappeared get cleaned up. -/// -/// - Human: `{agent}.{domain}` — browser dashboard only -/// - Human: `.{agent}.{domain}` for labels in -/// `ADMIN_LABELS` (ttyd et al) — org members only -/// - Bypass: `{agent}.{domain}/health` — public; carries the Noise -/// pre-handshake `{quote_b64, pubkey_hex}` in its response -/// - Bypass: `{agent}.{domain}/deploy` — GH-OIDC-gated in code -/// - Bypass: `{agent}.{domain}/exec` — GH-OIDC-gated in code -/// - Bypass: `{agent}.{domain}/owner` — fleet-GH-OIDC-gated in code -/// - Bypass: `{agent}.{domain}/logs/*` — GH-OIDC-gated in code -/// - Bypass: `{agent}.{domain}/noise/ws` — Noise_IK-gated in code -/// against the CP-trusted paired device pubkey set -/// - Bypass: `{label}.{agent}.{domain}` for other labels — workload -/// URLs are public by default (this is the nvidia-smi exemption). 
-/// - Any existing `*.{agent}.{domain}` app whose label is no longer -/// in `workload_labels` is deleted. -pub async fn provision_agent_access( - http: &Client, - cf: &CfCreds, - env: &str, - agent_hostname: &str, - owner: &crate::gh_oidc::Principal, - admin_email: &str, - workload_labels: &[String], -) -> Result<()> { - let idp = github_idp_uuid(http, cf).await?; - let human = human_policy(owner, admin_email, &idp); - - ensure_app( - http, - cf, - &format!("dd-{env}-agent-{agent_hostname}"), - agent_hostname, - vec![human], - ) - .await?; - for (suffix, label) in [ - ("/health", "health"), - ("/deploy", "deploy"), - ("/exec", "exec"), - ("/owner", "owner"), - ("/logs", "logs"), - // The in-process Noise gateway (merged into the agent's - // router in agent.rs) lives on the same port + hostname as - // /deploy + /exec, so a direct bastion-app attach needs the - // same bypass treatment the CP hostname already gets. The - // pre-handshake quote bundle is served by /health above - // (collapsed from the old /attest), so only the handshake - // WebSocket needs its own entry here. Noise_IK against the - // paired device pubkey set is the gate. - ("/noise/ws", "noise-ws"), - ] { - ensure_bypass_app( - http, - cf, - &format!("dd-{env}-agent-{agent_hostname}-{label}"), - &format!("{agent_hostname}{suffix}"), - ) - .await?; - } - - let desired: std::collections::HashSet = workload_labels - .iter() - .map(|l| label_hostname(agent_hostname, l)) - .collect(); - for label in workload_labels { - let domain = label_hostname(agent_hostname, label); - if is_admin_label(label) { - ensure_app( - http, - cf, - &format!("dd-{env}-workload-{domain}"), - &domain, - vec![human_policy(owner, admin_email, &idp)], - ) - .await?; - } else { - ensure_bypass_app(http, cf, &format!("dd-{env}-workload-{domain}"), &domain).await?; - } - } - - // Reap any stale workload apps under this agent that are no - // longer in the desired set. 
Flat-subdomain workload URLs look - // like `{base}-{label}.{tld}` (one level deep — see - // `label_hostname` for why). Prefix + suffix match so we don't - // accidentally delete the agent's own human app at - // `{base}.{tld}`. - let (base, tld) = agent_hostname - .split_once('.') - .unwrap_or((agent_hostname, "")); - let prefix = format!("{base}-"); - let suffix_tld = format!(".{tld}"); - let resp = call( - http, - cf, - Method::GET, - &format!("/accounts/{}/access/apps?per_page=1000", cf.account_id), - None, - ) - .await?; - if let Some(items) = resp["result"].as_array() { - for app in items { - let Some(domain) = app["domain"].as_str() else { - continue; - }; - if !(domain.starts_with(&prefix) && domain.ends_with(&suffix_tld)) { - continue; - } - if desired.contains(domain) { - continue; - } - if let Some(id) = app["id"].as_str() { - let _ = call( - http, - cf, - Method::DELETE, - &format!("/accounts/{}/access/apps/{id}", cf.account_id), - None, - ) - .await; - } - } - } - Ok(()) -} - -/// Cleanup hook invoked by the collector's orphan-GC path and by any -/// explicit reap: delete the agent's human app, its /health bypass, -/// and every `*.{agent_hostname}` workload bypass in one sweep. -pub async fn delete_access_apps_for(http: &Client, cf: &CfCreds, agent_hostname: &str) { - let Ok(resp) = call( - http, - cf, - Method::GET, - &format!("/accounts/{}/access/apps?per_page=1000", cf.account_id), - None, - ) - .await - else { - return; - }; - let Some(items) = resp["result"].as_array() else { - return; - }; - let (base, tld) = agent_hostname - .split_once('.') - .unwrap_or((agent_hostname, "")); - let prefix = format!("{base}-"); - let suffix_tld = format!(".{tld}"); - for app in items { - let Some(domain) = app["domain"].as_str() else { - continue; - }; - // Match: the agent hostname itself, any path-scoped app - // under it, and flat-subdomain workload URLs of the shape - // `{base}-*.{tld}`. 
- let matches_agent = domain == agent_hostname - || domain.starts_with(&format!("{agent_hostname}/")) - || (domain.starts_with(&prefix) && domain.ends_with(&suffix_tld)); - if !matches_agent { - continue; - } - if let Some(id) = app["id"].as_str() { - let _ = call( - http, - cf, - Method::DELETE, - &format!("/accounts/{}/access/apps/{id}", cf.account_id), - None, - ) - .await; - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn label_hostname_flattens_to_one_level() { - assert_eq!( - label_hostname("pr-144.devopsdefender.com", "term"), - "pr-144-term.devopsdefender.com" - ); - assert_eq!( - label_hostname("dd-pr-144-agent-abc.devopsdefender.com", "gpu"), - "dd-pr-144-agent-abc-gpu.devopsdefender.com" - ); - assert_eq!( - label_hostname("app.devopsdefender.com", "term"), - "app-term.devopsdefender.com" - ); - } - - #[test] - fn label_hostname_handles_dotless_hostname() { - assert_eq!(label_hostname("localhost", "term"), "localhost-term"); - } -} diff --git a/src/collector.rs b/src/collector.rs deleted file mode 100644 index e36e7fb..0000000 --- a/src/collector.rs +++ /dev/null @@ -1,365 +0,0 @@ -//! Background collector — discovers agents, scrapes `/health`, GC's dead tunnels. -//! -//! Every agent entry in the store carries freshly-verified ITA claims. The -//! collector scrapes `/health`, extracts the `ita_token` field, and runs -//! it through the CP's verifier; agents whose tokens are missing, expired, -//! or mis-signed don't enter the store. One tick: -//! -//! 1. List CF tunnels whose name starts with `dd-{env}-agent-`. -//! 2. Scrape `https://{tunnel-name}.{domain}/health` in parallel. -//! 3. Verify the `ita_token` field from each /health body. -//! 4. Insert on success, including tunnel id and reported ingress. -//! 5. Mark dead / GC tunnel on repeated scrape failures. -//! 6. Refresh the `control-plane` entry (its claims come from CP startup). 
- -use std::collections::HashMap; -use std::sync::Arc; -use std::time::Duration; - -use chrono::{DateTime, Utc}; -use serde::{Deserialize, Serialize}; -use tokio::sync::Mutex; - -use crate::cf; -use crate::config::CfCreds; -use crate::ee::Ee; -use crate::ita; - -const DEAD_THRESHOLD_SECS: i64 = 300; - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Agent { - pub agent_id: String, - pub hostname: String, - pub vm_name: String, - pub attestation_type: String, - pub status: String, - pub last_seen: DateTime, - pub deployment_count: usize, - pub deployment_names: Vec, - pub cpu_percent: u64, - pub memory_used_mb: u64, - pub memory_total_mb: u64, - #[serde(default)] - pub nets: Vec, - #[serde(default)] - pub disks: Vec, - /// Intel-verified ITA claims. Required — agents without a valid - /// token don't enter the store. - pub ita: ita::Claims, - /// CF tunnel ID (not name) — needed to re-PUT ingress at runtime - /// when a POSTed workload declares `expose`. Empty for the - /// `control-plane` pseudo-entry which doesn't take runtime slop. - #[serde(default)] - pub tunnel_id: String, - /// Currently-active per-workload ingress rules for this agent's - /// tunnel. Seeded at /register from the boot-workload `expose` - /// set; appended on each runtime /ingress/replace call. If the - /// agent relaunches, the CP re-seeds from the new register's - /// `extra_ingress` field. If only the CP restarts, the collector - /// recovers this list from the agent's `/health` response. - #[serde(default)] - pub extras: Vec<(String, u16)>, -} - -pub type Store = Arc>>; - -#[allow(clippy::too_many_arguments)] -pub async fn run( - store: Store, - cf: CfCreds, - env_label: String, - cp_hostname: String, - ee: Arc, - verifier: Arc, - interval: Duration, -) -> ! 
{ - let prefix = cf::agent_prefix(&env_label); - let http = reqwest::Client::builder() - .timeout(Duration::from_secs(5)) - .build() - .unwrap_or_else(|_| reqwest::Client::new()); - - eprintln!( - "cp: collector starting (prefix={prefix}*, interval={}s)", - interval.as_secs() - ); - - let mut ticker = tokio::time::interval(interval); - loop { - ticker.tick().await; - tick( - &store, - &http, - &cf, - &prefix, - &ee, - &verifier, - &env_label, - &cp_hostname, - ) - .await; - } -} - -#[allow(clippy::too_many_arguments)] -async fn tick( - store: &Store, - http: &reqwest::Client, - cf: &CfCreds, - prefix: &str, - ee: &Arc, - verifier: &Arc, - env_label: &str, - cp_hostname: &str, -) { - let tunnels: Vec<(String, String, String)> = cf::list(http, cf) - .await - .unwrap_or_default() - .into_iter() - .filter_map(|t| { - let name = t["name"].as_str()?; - let id = t["id"].as_str()?; - if !name.starts_with(prefix) { - return None; - } - let hostname = format!("{name}.{}", cf.domain); - Some((name.to_string(), id.to_string(), hostname)) - }) - .collect(); - - let scrapes = tunnels.iter().map(|(name, tunnel_id, host)| { - let http = http.clone(); - let name = name.clone(); - let tunnel_id = tunnel_id.clone(); - let host = host.clone(); - async move { - let r = http.get(format!("https://{host}/health")).send().await; - match r { - Ok(resp) if resp.status().is_success() => { - let body = resp.json::().await.ok(); - (name, tunnel_id, host, body, None) - } - Ok(resp) => ( - name, - tunnel_id, - host, - None, - Some(format!("status {}", resp.status())), - ), - Err(e) => (name, tunnel_id, host, None, Some(e.to_string())), - } - } - }); - let results = futures_util::future::join_all(scrapes).await; - - let now = Utc::now(); - let mut orphans: Vec<(String, String)> = Vec::new(); - let mut verified = 0usize; - - for (name, tunnel_id, host, body, err) in &results { - let Some(h) = body else { - mark_stale_or_orphan(store, host, name, err, now, &mut orphans).await; - continue; - }; - let 
Some(token) = h["ita_token"].as_str() else { - eprintln!("cp: collector: {name} /health lacks ita_token — skipping"); - continue; - }; - let claims = match verifier.verify(token).await { - Ok(c) => c, - Err(e) => { - eprintln!("cp: collector: {name} ITA verify failed: {e}"); - continue; - } - }; - // Store key is the tunnel name (authoritative on the CP side), - // NOT the agent's self-reported agent_id. - let mut s = store.lock().await; - let extras = parse_extra_ingress(h) - .unwrap_or_else(|| s.get(name).map(|a| a.extras.clone()).unwrap_or_default()); - s.insert( - name.clone(), - Agent { - agent_id: name.clone(), - hostname: host.clone(), - vm_name: h["vm_name"].as_str().unwrap_or("unknown").to_string(), - attestation_type: h["attestation_type"] - .as_str() - .unwrap_or("unknown") - .to_string(), - status: "healthy".into(), - last_seen: now, - deployment_count: h["deployment_count"].as_u64().unwrap_or(0) as usize, - deployment_names: h["deployments"] - .as_array() - .map(|a| { - a.iter() - .filter_map(|v| v.as_str().map(String::from)) - .collect() - }) - .unwrap_or_default(), - cpu_percent: h["cpu_percent"].as_u64().unwrap_or(0), - memory_used_mb: h["memory_used_mb"].as_u64().unwrap_or(0), - memory_total_mb: h["memory_total_mb"].as_u64().unwrap_or(0), - nets: serde_json::from_value(h["nets"].clone()).unwrap_or_default(), - disks: serde_json::from_value(h["disks"].clone()).unwrap_or_default(), - ita: claims, - tunnel_id: tunnel_id.clone(), - extras, - }, - ); - drop(s); - verified += 1; - } - - // Collect + delete dead entries from the store. 
- let dead: Vec = { - let s = store.lock().await; - s.iter() - .filter(|(_, a)| a.status == "dead") - .map(|(k, _)| k.clone()) - .collect() - }; - for k in &dead { - store.lock().await.remove(k); - } - - if !orphans.is_empty() { - for (name, host) in &orphans { - eprintln!("cp: GC dead tunnel {name}"); - cf::delete_by_name(http, cf, name).await; - let _ = cf::delete_cname(http, cf, host).await; - // Sweep the agent's CF Access apps — its human dashboard - // app and every workload-URL bypass under this hostname. - // Without this the account accumulates dead apps every - // STONITH cycle. - cf::delete_access_apps_for(http, cf, host).await; - } - } - - // Refresh the CP's own entry. Its ITA claims were seeded at CP - // startup and are preserved here across ticks; everything else - // (status, workloads, attestation) gets refreshed from EE. - let (deployments, attestation) = match ee.list().await { - Ok(deps) => { - let names: Vec = deps["deployments"] - .as_array() - .map(|a| { - a.iter() - .filter_map(|v| v["app_name"].as_str().map(String::from)) - .collect() - }) - .unwrap_or_default(); - let att = ee - .health() - .await - .ok() - .and_then(|h| h["attestation_type"].as_str().map(String::from)) - .unwrap_or_else(|| "tdx".into()); - (names, att) - } - Err(_) => (vec![], "tdx".into()), - }; - let count = deployments.len(); - let mut store_lock = store.lock().await; - if let Some(cp) = store_lock.get_mut("control-plane") { - cp.hostname = cp_hostname.to_string(); - cp.vm_name = format!("dd-{env_label}-cp"); - cp.attestation_type = attestation; - cp.status = "healthy".into(); - cp.last_seen = now; - cp.deployment_count = count; - cp.deployment_names = deployments; - } - drop(store_lock); - - eprintln!( - "cp: scraped {} tunnels ({verified} verified, {} orphans GC'd)", - results.len(), - orphans.len() - ); -} - -fn parse_extra_ingress(h: &serde_json::Value) -> Option> { - h.get("extra_ingress")?.as_array().map(|items| { - items - .iter() - .filter_map(|item| { - let 
label = item.get("hostname_label")?.as_str()?; - let port = item.get("port")?.as_u64()?; - if label.is_empty() || port == 0 || port > u16::MAX as u64 { - return None; - } - Some((label.to_string(), port as u16)) - }) - .collect() - }) -} - -async fn mark_stale_or_orphan( - store: &Store, - host: &str, - name: &str, - err: &Option, - now: DateTime, - orphans: &mut Vec<(String, String)>, -) { - let mut s = store.lock().await; - if let Some(a) = s.values_mut().find(|a| a.hostname == *host) { - let age = now.signed_duration_since(a.last_seen).num_seconds(); - if age > DEAD_THRESHOLD_SECS { - a.status = "dead".into(); - orphans.push((name.to_string(), host.to_string())); - } else { - a.status = "stale".into(); - } - } else if err - .as_ref() - .is_some_and(|e| e.contains("connect") || e.contains("timed out")) - { - orphans.push((name.to_string(), host.to_string())); - } -} - -#[cfg(test)] -mod tests { - use super::parse_extra_ingress; - - #[test] - fn missing_extra_ingress_preserves_existing_state() { - let h = serde_json::json!({}); - - assert_eq!(parse_extra_ingress(&h), None); - } - - #[test] - fn parses_valid_extra_ingress() { - let h = serde_json::json!({ - "extra_ingress": [ - {"hostname_label": "gpu", "port": 8081}, - {"hostname_label": "web", "port": 9000} - ] - }); - - assert_eq!( - parse_extra_ingress(&h), - Some(vec![("gpu".into(), 8081), ("web".into(), 9000)]) - ); - } - - #[test] - fn drops_malformed_extra_ingress_entries() { - let h = serde_json::json!({ - "extra_ingress": [ - {"hostname_label": "gpu", "port": 8081}, - {"hostname_label": "", "port": 8082}, - {"hostname_label": "bad-zero", "port": 0}, - {"hostname_label": "bad-wide", "port": 70000}, - {"hostname_label": "bad-string", "port": "8083"} - ] - }); - - assert_eq!(parse_extra_ingress(&h), Some(vec![("gpu".into(), 8081)])); - } -} diff --git a/src/config.rs b/src/config.rs deleted file mode 100644 index d155a26..0000000 --- a/src/config.rs +++ /dev/null @@ -1,367 +0,0 @@ -//! 
Environment-derived configuration for both modes. - -use crate::error::{Error, Result}; -use crate::gh_oidc::{Principal, PrincipalKind}; - -#[derive(Clone)] -pub struct CfCreds { - pub api_token: String, - pub account_id: String, - pub zone_id: String, - pub domain: String, -} - -impl CfCreds { - pub fn from_env() -> Result { - let get = |k: &str| std::env::var(k).map_err(|_| Error::Internal(format!("{k} not set"))); - Ok(Self { - api_token: get("DD_CF_API_TOKEN")?, - account_id: get("DD_CF_ACCOUNT_ID")?, - zone_id: get("DD_CF_ZONE_ID")?, - domain: get("DD_CF_DOMAIN")?, - }) - } -} - -/// Configuration shared between modes. -pub struct Common { - pub env_label: String, - pub port: u16, - pub owner: Principal, - pub vm_name: String, -} - -impl Common { - pub fn from_env() -> Result { - let env_label = std::env::var("DD_ENV").map_err(|_| { - Error::Internal("DD_ENV required (dev / staging / production / pr-*)".into()) - })?; - let port = std::env::var("DD_PORT") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(8080); - // DD_OWNER + DD_OWNER_ID + DD_OWNER_KIND together describe the - // principal authorized to deploy to this agent. - // - // kind=user|org → DD_OWNER is a GitHub login (no '/'). The - // verifier matches tokens whose - // repository_owner == DD_OWNER and - // repository_owner_id == DD_OWNER_ID. The - // two kinds differ only at CF Access: - // kind=org maps to a github-organization - // include rule; kind=user falls back to - // admin_email-only for the dashboard. - // kind=repo → DD_OWNER is "/" (one '/'), - // verifier matches repository == DD_OWNER - // and repository_id == DD_OWNER_ID. - // Dashboard CF Access falls back to - // admin_email-only. - // - // DD_OWNER_ID defeats login-squat — a deleted/transferred - // account whose login is later re-registered will produce - // tokens with a different numeric id and be rejected. - // - // All three are required at boot. Existing agents from before - // this change must be re-provisioned. 
- let owner_name = - std::env::var("DD_OWNER").map_err(|_| Error::Internal("DD_OWNER required".into()))?; - let owner_id: u64 = std::env::var("DD_OWNER_ID") - .map_err(|_| Error::Internal("DD_OWNER_ID required (numeric GitHub id)".into()))? - .parse() - .map_err(|e| Error::Internal(format!("DD_OWNER_ID parse: {e}")))?; - let owner_kind = PrincipalKind::parse( - &std::env::var("DD_OWNER_KIND") - .map_err(|_| Error::Internal("DD_OWNER_KIND required (user|org|repo)".into()))?, - )?; - let owner = Principal::from_parts(owner_name, owner_id, owner_kind)?; - let vm_name = std::env::var("DD_VM_NAME").unwrap_or_else(|_| { - std::fs::read_to_string("/etc/hostname") - .ok() - .map(|s| s.trim().to_string()) - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| "unknown".into()) - }); - Ok(Self { - env_label, - port, - owner, - vm_name, - }) - } -} - -/// ITA (Intel Trust Authority) configuration. All fields required — -/// attestation is mandatory in both modes. -#[derive(Clone)] -pub struct Ita { - /// URL for Intel's ITA mint endpoint, e.g. `https://api.trustauthority.intel.com`. - pub base_url: String, - /// API key for the mint endpoint. - pub api_key: String, - /// JWKS endpoint for verifier. - pub jwks_url: String, - /// Expected `iss` claim. - pub issuer: String, -} - -impl Ita { - pub fn from_env() -> Result { - let get = |k: &str| { - std::env::var(k) - .ok() - .filter(|s| !s.is_empty()) - .ok_or_else(|| Error::Internal(format!("{k} required"))) - }; - Ok(Self { - base_url: get("DD_ITA_BASE_URL")?, - api_key: get("DD_ITA_API_KEY")?, - jwks_url: get("DD_ITA_JWKS_URL")?, - issuer: get("DD_ITA_ISSUER")?, - }) - } -} - -/// CF Access configuration — one email that's always allowed in -/// alongside GitHub org members. Required so the operator has a -/// break-glass login path if org membership checks break. 
-#[derive(Clone)] -pub struct CfAccess { - pub admin_email: String, -} - -impl CfAccess { - pub fn from_env() -> Result { - let admin_email = std::env::var("DD_ACCESS_ADMIN_EMAIL") - .map_err(|_| { - Error::Internal( - "DD_ACCESS_ADMIN_EMAIL required (break-glass human login for CF Access)".into(), - ) - })? - .trim() - .to_string(); - if admin_email.is_empty() || !admin_email.contains('@') { - return Err(Error::Internal( - "DD_ACCESS_ADMIN_EMAIL must be a valid email address".into(), - )); - } - Ok(Self { admin_email }) - } -} - -/// Control-plane-mode config. -pub struct Cp { - pub common: Common, - pub cf: CfCreds, - pub access: CfAccess, - pub hostname: String, - pub scrape_interval_secs: u64, - pub ita: Ita, - /// Source-of-truth file for the device registry (JSON). Survives - /// CP restart; mutations fsync through to disk. - pub devices_path: std::path::PathBuf, - /// Where the Noise gateway persists its X25519 static private key - /// (tmpfs). Fresh per-boot when missing. - pub noise_key_path: std::path::PathBuf, -} - -impl Cp { - pub fn from_env() -> Result { - let common = Common::from_env()?; - let cf = CfCreds::from_env()?; - let access = CfAccess::from_env()?; - let hostname = std::env::var("DD_HOSTNAME") - .map_err(|_| Error::Internal("DD_HOSTNAME required in CP mode".into()))?; - let scrape_interval_secs = std::env::var("DD_SCRAPE_INTERVAL") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(30); - // `/tmp/` (tmpfs) is the only universally-writable path in the - // EE workload sandbox — the root FS is RO, and `/var/lib/` + - // `/data/` are both unavailable on the CP VM (no mount-data - // in the CP boot set; root FS read-only for workloads). - // Ephemeral across CP restarts is OK: the zero-downtime - // boot hydrates devices from the predecessor CP via - // `/api/v1/admin/export` before flipping DNS, so the on-disk - // copy is a cache, not source of truth. 
- let devices_path = std::env::var("DD_CP_DEVICES_PATH") - .unwrap_or_else(|_| "/tmp/devopsdefender/devices.json".into()) - .into(); - let noise_key_path = std::env::var("DD_NOISE_KEY_PATH") - .unwrap_or_else(|_| "/run/devopsdefender/noise.key".into()) - .into(); - Ok(Self { - common, - cf, - access, - hostname, - scrape_interval_secs, - ita: Ita::from_env()?, - devices_path, - noise_key_path, - }) - } -} - -/// Agent-mode config. No PAT — the agent authenticates to the CP with -/// ITA attestation at /register and the CF Access service token -/// (received in the register response) on subsequent calls. -pub struct Agent { - pub common: Common, - pub cp_url: String, - pub ee_socket: String, - pub ita: Ita, - /// Extra cloudflared ingress rules requested at register time, - /// parsed from `DD_EXTRA_INGRESS` (a comma-separated list of - /// `label:port` pairs, e.g. `gpu:8081,web:9000`). The boot-workload - /// builder (`apps/_infra/local-agents.sh`) collects these from - /// `expose` hints on individual workload specs. Empty is fine — - /// the agent just gets the default dashboard rule. - pub extra_ingress: Vec<(String, u16)>, - /// Confidential-mode flag from `DD_CONFIDENTIAL`. When true, the - /// agent omits `/deploy`, `/exec`, and `/owner` from its router — - /// no one (not tenant, not ops) can mutate the running workload - /// post-boot. `/logs` + `/health` + attestation stay open. Used by - /// Sats for Compute's "confidential mode" product variant for - /// oracle / bot-oracle workloads where the operator proves to - /// third parties that the code is sealed. Backward-compatible: - /// unset = default (mutation endpoints enabled). - pub confidential: bool, -} - -impl Agent { - pub fn from_env() -> Result { - let common = Common::from_env()?; - let cp_url = std::env::var("DD_CP_URL").map_err(|_| { - Error::Internal("DD_CP_URL required (e.g. 
https://app.devopsdefender.com)".into()) - })?; - let ee_socket = std::env::var("EE_SOCKET_PATH") - .unwrap_or_else(|_| "/var/lib/easyenclave/agent.sock".into()); - let extra_ingress = parse_extra_ingress()?; - let confidential = parse_truthy("DD_CONFIDENTIAL"); - Ok(Self { - common, - cp_url, - ee_socket, - ita: Ita::from_env()?, - extra_ingress, - confidential, - }) - } -} - -/// Best-effort bool parser for env flags. Treats any of -/// `1 / true / yes / on` (case-insensitive) as true; everything else -/// (including empty and absent) as false. -fn parse_truthy(key: &str) -> bool { - std::env::var(key) - .map(|v| { - matches!( - v.trim().to_ascii_lowercase().as_str(), - "1" | "true" | "yes" | "on" - ) - }) - .unwrap_or(false) -} - -/// Parse `DD_EXTRA_INGRESS` as a comma-separated list of `label:port` -/// pairs — e.g. `"gpu:8081"` or `"gpu:8081,web:9000"`. Chosen over -/// JSON to sidestep `"`-escaping when the value is substituted into -/// the dd-agent workload template's `"DD_EXTRA_INGRESS=${…}"` env -/// entry (embedded quotes would close the outer JSON string early). -/// Empty / unset → empty Vec. 
-fn parse_extra_ingress() -> Result> { - let raw = match std::env::var("DD_EXTRA_INGRESS") { - Ok(s) if !s.trim().is_empty() => s, - _ => return Ok(Vec::new()), - }; - let mut out = Vec::new(); - for entry in raw.split(',') { - let entry = entry.trim(); - if entry.is_empty() { - continue; - } - let (label, port_s) = entry.split_once(':').ok_or_else(|| { - Error::Internal(format!( - "DD_EXTRA_INGRESS entry {entry:?}: expected label:port" - )) - })?; - let port: u16 = port_s.parse().map_err(|e| { - Error::Internal(format!( - "DD_EXTRA_INGRESS entry {entry:?}: port must be u16 ({e})" - )) - })?; - if label.is_empty() { - return Err(Error::Internal(format!( - "DD_EXTRA_INGRESS entry {entry:?}: empty label" - ))); - } - out.push((label.to_string(), port)); - } - Ok(out) -} - -#[cfg(test)] -mod tests { - use super::*; - use std::sync::{Mutex, OnceLock}; - - static ENV_LOCK: OnceLock> = OnceLock::new(); - - fn parse(s: &str) -> Result> { - let _guard = ENV_LOCK - .get_or_init(|| Mutex::new(())) - .lock() - .unwrap_or_else(|e| e.into_inner()); - unsafe { - std::env::set_var("DD_EXTRA_INGRESS", s); - } - let r = parse_extra_ingress(); - unsafe { - std::env::remove_var("DD_EXTRA_INGRESS"); - } - r - } - - #[test] - fn empty_parses_to_empty_vec() { - assert!(parse("").unwrap().is_empty()); - assert!(parse(" ").unwrap().is_empty()); - } - - #[test] - fn single_entry() { - assert_eq!(parse("gpu:8081").unwrap(), vec![("gpu".into(), 8081)]); - } - - #[test] - fn multiple_entries() { - assert_eq!( - parse("gpu:8081,web:9000").unwrap(), - vec![("gpu".into(), 8081), ("web".into(), 9000)] - ); - } - - #[test] - fn tolerates_whitespace_and_trailing_commas() { - assert_eq!( - parse("gpu:8081, , web:9000,").unwrap(), - vec![("gpu".into(), 8081), ("web".into(), 9000)] - ); - } - - #[test] - fn bad_port_errors() { - assert!(parse("gpu:notaport").is_err()); - assert!(parse("gpu:99999").is_err()); // > u16 - } - - #[test] - fn missing_colon_errors() { - assert!(parse("gpu").is_err()); - } - 
- #[test] - fn empty_label_errors() { - assert!(parse(":8081").is_err()); - } -} diff --git a/src/cp.rs b/src/cp.rs deleted file mode 100644 index 5475d76..0000000 --- a/src/cp.rs +++ /dev/null @@ -1,1258 +0,0 @@ -//! Control-plane mode — fleet registry, agent registration, dashboard, shell. -//! -//! One HTTP port (`$DD_PORT`, default 8080) behind the CP's own CF tunnel. -//! On startup we: self-provision a CF tunnel at `$DD_HOSTNAME`, spawn -//! cloudflared, STONITH any older CP (by `dd-{env}-cp-*` name prefix), -//! provision CF Access apps + a shared service token, start the -//! self-watchdog and collector, then serve the router. No app-layer -//! auth — CF Access validates every request at the edge. - -use std::collections::HashMap; -use std::sync::Arc; -use std::time::{Duration, Instant}; - -use axum::extract::{Path, Query, State}; -use axum::response::{Html, IntoResponse, Response}; -use axum::routing::{get, post}; -use axum::{Json, Router}; -use serde::Deserialize; -use tokio::sync::{Mutex, RwLock}; - -use crate::cf; -use crate::collector::{self, Store}; -use crate::config::Cp as Cfg; -use crate::ee::Ee; -use crate::error::{Error, Result}; -use crate::html::{self, shell}; -use crate::ita; -use crate::metrics; -use crate::noise_gateway; -use crate::stonith; - -/// Re-mint interval for the CP's own ITA token. The CP isn't scraped -/// by its own collector (different tunnel prefix), so a background -/// task is the only thing keeping the `control-plane` entry's claims -/// fresh on the dashboard. -const ITA_REFRESH: Duration = Duration::from_secs(180); - -#[derive(Clone)] -struct St { - cfg: Arc, - ee: Arc, - store: Store, - started: Instant, - verifier: Arc, - /// The CP's own ITA token. Refreshed by a background task. - cp_ita_token: Arc>, - /// GH OIDC verifier for `/api/agents` callers (CI, humans). Same - /// audience as dd-agent, shared owner claim. - gh: Arc, - /// Paired device pubkeys. 
Mutations persist to disk and emit a - /// runtime view for the local ee-proxy workload. - devices: Arc, - /// TDX-quote + Noise-static-pubkey bundle. Surfaced by `/health` - /// as `{ noise: { quote_b64, pubkey_hex } }` so a bastion-app - /// bootstraps in one fetch (the former standalone `/attest` - /// endpoint was folded in). Shared `Arc` with the Noise gateway - /// module's handshake responder — one keypair / one quote per - /// boot. - attest: Arc, -} - -pub async fn run() -> Result<()> { - let cfg = Arc::new(Cfg::from_env()?); - let ee = Arc::new(Ee::new("/var/lib/easyenclave/agent.sock")); - - let http = reqwest::Client::new(); - - // Stage 1: mint + verify our own ITA token before touching CF. - // We need the token as a Bearer for the hydrate call below; if - // the old CP is still serving at `cfg.hostname`, it'll verify - // our ITA and hand over its state. - let verifier = ita::Verifier::new(cfg.ita.jwks_url.clone(), cfg.ita.issuer.clone()); - eprintln!("cp: ITA verifier enabled (issuer={})", cfg.ita.issuer); - let initial_token = match mint_cp_ita(&cfg, &ee).await { - Ok(t) => t, - Err(e) => { - eprintln!("cp: ITA mint failed: {e}"); - stonith::poweroff(); - } - }; - let cp_claims_initial = match verifier.verify(&initial_token).await { - Ok(c) => c, - Err(e) => { - eprintln!("cp: ITA self-verify failed: {e}"); - stonith::poweroff(); - } - }; - eprintln!( - "cp: own ITA verified mrtd={} tcb={}", - cp_claims_initial.mrtd.as_deref().unwrap_or("?"), - cp_claims_initial.tcb_status.as_deref().unwrap_or("?") - ); - - // Stage 2: hydrate state from the predecessor CP if one is live - // at `cfg.hostname`. DNS hasn't flipped yet — any existing CNAME - // still points at the old CP's tunnel, so this GET lands on the - // old CP. Tolerant of failure (first boot, old CP already dead, - // stale code without `/api/v1/admin/export`). - // - // Gated on "does a predecessor CP tunnel exist for this env?" 
- // because on a *fresh* env (first PR deploy, or after cleanup - // reaped the old tunnel) the hostname has no CNAME, `getaddrinfo` - // returns NXDOMAIN, and the host's libvirt dnsmasq at - // 192.168.122.1 negative-caches that — with a default neg-TTL of - // minutes. Stage 3 below then creates the CNAME, but every VM on - // the default network (including the dd-local-preview agent that - // `release.yml` spins up right after) keeps seeing the cached - // NXDOMAIN until the TTL expires, blowing past the agent's - // register-retry budget and dying fatal. We diagnosed this by - // (a) the agent's `Name does not resolve` spam in its serial log - // while (b) the same hostname resolving from the CI runner, then - // (c) confirming via `dig @192.168.122.1`. The `cf::list` probe - // is against `api.cloudflare.com` (always resolves), so it's - // safe to run unconditionally — it doesn't touch the poisonable - // hostname at all. - let store: Store = Arc::new(Mutex::new(HashMap::new())); - let trust = noise_gateway::new_trust_handle(); - let devices = crate::devices::Store::load(cfg.devices_path.clone(), trust.clone()) - .await - .map_err(|e| Error::Internal(format!("devices store load: {e}")))?; - let predecessor_prefix = cf::cp_prefix(&cfg.common.env_label); - let has_predecessor = match cf::list(&http, &cfg.cf).await { - Ok(tunnels) => tunnels.iter().any(|t| { - t["name"] - .as_str() - .is_some_and(|n| n.starts_with(&predecessor_prefix)) - }), - Err(e) => { - // Ambiguous — CF API unreachable. Skip hydrate rather than - // probe-and-poison; the CP's own tunnel Stage 3 creates - // below doesn't depend on this branch, and a missed - // hydrate is strictly less bad than a poisoned dnsmasq. 
- eprintln!("cp: predecessor probe failed ({e}); skipping hydrate"); - false - } - }; - if has_predecessor { - hydrate_from_peer(&http, &cfg.hostname, &initial_token, &devices, &store).await; - } else { - eprintln!("cp: no predecessor {predecessor_prefix}* tunnel — skipping hydrate (fresh env)"); - } - - // Stage 3: create our own tunnel. `cf::create` upserts the CNAME - // for `cfg.hostname` → our tunnel id; traffic moves to us the - // moment CF's edge propagates that change. - eprintln!("cp: self-provisioning tunnel for {}", cfg.hostname); - let self_name = cf::cp_tunnel_name(&cfg.common.env_label); - let cp_extras: Vec<(String, u16)> = vec![("block".into(), 7681)]; - let tunnel = match cf::create(&http, &cfg.cf, &self_name, &cfg.hostname, &cp_extras).await { - Ok(t) => t, - Err(e) => { - eprintln!("cp: self-register failed: {e}"); - stonith::poweroff(); - } - }; - eprintln!("cp: self tunnel {} → {}", tunnel.id, tunnel.hostname); - - spawn_cloudflared(tunnel.token.clone()); - - { - let http = http.clone(); - let cf = cfg.cf.clone(); - let id = tunnel.id.clone(); - let env = cfg.common.env_label.clone(); - tokio::spawn(async move { - tokio::time::sleep(Duration::from_secs(5)).await; - stonith::kill_old_tunnels(&http, &cf, &id, &env).await; - }); - } - - // Graceful shutdown signal. The watchdog triggers it when the - // tunnel's been reaped by a successor CP; axum::serve below - // awaits it and drains in-flight connections before exiting. - let (shutdown_tx, mut shutdown_rx) = tokio::sync::broadcast::channel::<()>(1); - tokio::spawn(stonith::self_watchdog( - cfg.cf.clone(), - tunnel.id.clone(), - shutdown_tx.clone(), - )); - - // Stage 4: provision CF Access apps for our own tunnel. Workload - // labels (e.g. `block` for ttyd) get the human policy; the - // paths in the bypass list (`/register`, `/api/agents`, - // `/api/v1/devices/trusted`, `/api/v1/admin/export`, `/noise/ws`, - // `/health`, …) are CF-bypassed with in-code gating. 
`/health` - // also carries the Noise pre-handshake quote + pubkey (the former - // `/attest` — now inlined to save a bootstrap round-trip). - let cp_labels: Vec = cp_extras.iter().map(|(l, _)| l.clone()).collect(); - if let Err(e) = cf::provision_cp_access( - &http, - &cfg.cf, - &cfg.common.env_label, - &cfg.hostname, - &cfg.common.owner, - &cfg.access.admin_email, - &cp_labels, - ) - .await - { - eprintln!("cp: CF Access provisioning failed: {e}"); - stonith::poweroff(); - } - eprintln!("cp: CF Access ready"); - let cp_ita_token = Arc::new(RwLock::new(initial_token)); - - // Seed the CP into the store before the collector starts ticking. - // The refresh loop below will keep these numbers current every - // ITA_REFRESH ticks; we fill them once at startup so the first - // render isn't all zeros while the refresh loop sleeps. - let cp_m = metrics::collect().await; - store.lock().await.insert( - "control-plane".into(), - collector::Agent { - agent_id: "control-plane".into(), - hostname: cfg.hostname.clone(), - vm_name: format!("dd-{}-cp", cfg.common.env_label), - attestation_type: "tdx".into(), - status: "healthy".into(), - last_seen: chrono::Utc::now(), - deployment_count: 0, - deployment_names: Vec::new(), - cpu_percent: cp_m.cpu_pct, - memory_used_mb: cp_m.mem_used_mb, - memory_total_mb: cp_m.mem_total_mb, - nets: cp_m.nets, - disks: cp_m.disks, - ita: cp_claims_initial, - // CP doesn't take per-workload runtime ingress — its own tunnel - // only routes `DD_HOSTNAME → localhost:8080`. tunnel_id stays - // empty so the runtime-ingress endpoint rejects attempts to - // target "control-plane". - tunnel_id: String::new(), - extras: Vec::new(), - }, - ); - - // Background re-mint of the CP's own ITA token. Also re-verifies - // and updates the `control-plane` store entry so the fleet card - // doesn't show "Expired". 
- { - let cfg = cfg.clone(); - let ee = ee.clone(); - let verifier = verifier.clone(); - let token = cp_ita_token.clone(); - let store = store.clone(); - tokio::spawn(async move { - loop { - tokio::time::sleep(ITA_REFRESH).await; - let fresh = match mint_cp_ita(&cfg, &ee).await { - Ok(t) => t, - Err(e) => { - eprintln!("cp: own ITA refresh mint failed: {e}"); - continue; - } - }; - let claims = match verifier.verify(&fresh).await { - Ok(c) => c, - Err(e) => { - eprintln!("cp: own ITA refresh verify failed: {e}"); - continue; - } - }; - *token.write().await = fresh; - // Also refresh live system metrics on the CP's own - // store entry — the collector scrapes other tunnels, - // not itself, so without this the /agent/control-plane - // detail page would show empty disks/nets/cpu forever. - let m = crate::metrics::collect().await; - if let Some(cp) = store.lock().await.get_mut("control-plane") { - cp.ita = claims; - cp.last_seen = chrono::Utc::now(); - cp.cpu_percent = m.cpu_pct; - cp.memory_used_mb = m.mem_used_mb; - cp.memory_total_mb = m.mem_total_mb; - cp.nets = m.nets; - cp.disks = m.disks; - } - eprintln!("cp: own ITA token refreshed"); - } - }); - } - - // Start the collector with the verifier. It re-verifies each - // scraped agent's ita_token, so expired / revoked / unsigned - // agents drop off the dashboard automatically. - tokio::spawn(collector::run( - store.clone(), - cfg.cf.clone(), - cfg.common.env_label.clone(), - cfg.hostname.clone(), - ee.clone(), - verifier.clone(), - Duration::from_secs(cfg.scrape_interval_secs), - )); - - let gh = crate::gh_oidc::Verifier::new(cfg.common.owner.clone(), "dd-agent".into()); - - // Noise gateway state. `devices` already loaded in Stage 2 + any - // inherited records merged in; `trust` is already populated. 
- let attestor = Arc::new( - noise_gateway::attest::Attestor::load_or_mint(&cfg.noise_key_path) - .await - .map_err(|e| Error::Internal(format!("noise keypair: {e}")))?, - ); - eprintln!("cp: noise_pubkey={}", hex::encode(attestor.public_key())); - let ee_token = std::env::var("EE_TOKEN").ok(); - let upstream = Arc::new(noise_gateway::upstream::EeAgent::new( - std::path::PathBuf::from(noise_gateway::upstream::DEFAULT_EE_AGENT_SOCK), - ee_token, - )); - let ng_state = noise_gateway::State { - attest: attestor.clone(), - trust: trust.clone(), - upstream, - }; - - let state = St { - cfg: cfg.clone(), - ee, - store, - started: Instant::now(), - verifier, - cp_ita_token, - gh, - devices, - attest: attestor, - }; - - let app = Router::new() - .route("/", get(fleet)) - .route("/health", get(health)) - .route("/register", post(register)) - .route("/ingress/replace", post(ingress_replace)) - .route("/agent/{id}", get(agent_detail)) - .route("/agent/{id}/logs/{app}", get(agent_logs)) - .route("/api/agents", get(api_agents)) - .route("/api/v1/devices", post(create_device)) - .route("/api/v1/devices/trusted", get(list_trusted_devices)) - .route( - "/api/v1/devices/{pubkey}", - axum::routing::delete(revoke_device), - ) - .route("/api/v1/admin/export", get(export_state)) - .route("/admin/enroll", get(enroll_page)) - .with_state(state) - .merge(noise_gateway::router(ng_state)); - - let addr = format!("0.0.0.0:{}", cfg.common.port); - eprintln!("cp: listening on {addr}"); - let listener = tokio::net::TcpListener::bind(&addr).await?; - // `into_make_service_with_connect_info` so handlers can see the - // peer socket address via the `ConnectInfo` extractor. Used by - // `api_agents` to grant auth-free access to same-VM loopback - // callers (same-VM workloads + dd-management proxy paths). 
- axum::serve( - listener, - app.into_make_service_with_connect_info::(), - ) - .with_graceful_shutdown(async move { - let _ = shutdown_rx.recv().await; - eprintln!("cp: graceful shutdown signaled; draining in-flight requests"); - }) - .await - .map_err(|e| Error::Internal(e.to_string())) -} - -fn spawn_cloudflared(token: String) { - tokio::spawn(async move { - eprintln!("cp: spawning cloudflared"); - match tokio::process::Command::new("cloudflared") - .args([ - "tunnel", - "--no-autoupdate", - "--metrics=", - "run", - "--token", - &token, - ]) - .spawn() - { - Ok(mut child) => { - let status = child.wait().await; - eprintln!("cp: cloudflared exited: {status:?} — poweroff"); - stonith::poweroff(); - } - Err(e) => { - eprintln!("cp: cloudflared spawn failed: {e} — poweroff"); - stonith::poweroff(); - } - } - }); -} - -/// Try to pull devices + agent snapshot from a predecessor CP still -/// serving at `hostname`. The CNAME hasn't flipped yet when this runs, -/// so any existing DNS record still points at the old CP's tunnel. -/// Failures (first boot, DNS miss, old code, timeout) are logged and -/// swallowed — deploy still proceeds as if fresh. 
-async fn hydrate_from_peer( - http: &reqwest::Client, - hostname: &str, - ita_token: &str, - devices: &crate::devices::Store, - agents: &Store, -) { - let url = format!("https://{hostname}/api/v1/admin/export"); - let resp = match http - .get(&url) - .bearer_auth(ita_token) - .timeout(Duration::from_secs(10)) - .send() - .await - { - Ok(r) => r, - Err(e) => { - eprintln!("cp: hydrate skipped ({url}): {e}"); - return; - } - }; - let status = resp.status(); - if !status.is_success() { - eprintln!("cp: hydrate skipped — {url} → {status}"); - return; - } - let body: serde_json::Value = match resp.json().await { - Ok(v) => v, - Err(e) => { - eprintln!("cp: hydrate parse failed ({url}): {e}"); - return; - } - }; - - let mut imported_devices = 0usize; - if let Some(arr) = body.get("devices").cloned() { - match serde_json::from_value::>(arr) { - Ok(devs) => { - let n = devs.len(); - if let Err(e) = devices.import_merge(devs).await { - eprintln!("cp: hydrate devices.import_merge: {e}"); - } else { - imported_devices = n; - } - } - Err(e) => eprintln!("cp: hydrate devices shape mismatch: {e}"), - } - } - - let mut imported_agents = 0usize; - if let Some(arr) = body.get("agents").cloned() { - match serde_json::from_value::>(arr) { - Ok(ags) => { - let mut store = agents.lock().await; - for a in ags { - store.insert(a.agent_id.clone(), a); - imported_agents += 1; - } - } - Err(e) => eprintln!("cp: hydrate agents shape mismatch: {e}"), - } - } - - eprintln!( - "cp: hydrated from {hostname} — {imported_devices} device(s), {imported_agents} agent(s)" - ); -} - -// ── Routes ────────────────────────────────────────────────────────────── - -async fn health( - State(s): State, - Query(q): Query>, -) -> Json { - use base64::Engine as _; - let agents = s.store.lock().await; - let mut body = serde_json::json!({ - "ok": true, - "service": "cp", - "hostname": s.cfg.hostname, - "env": s.cfg.common.env_label, - "uptime_secs": s.started.elapsed().as_secs(), - "agent_count": agents.len(), - 
"healthy_count": agents.values().filter(|a| a.status == "healthy").count(), - // Pre-Noise-handshake bundle — the former `GET /attest` - // endpoint folded in here so bastion-app bootstraps in one - // fetch and we drop a CF Access bypass-app per env × per - // service. Stable per boot; `Arc` clones are effectively free - // per request. `quote_b64` binds the raw Noise pubkey into - // TDX `report_data`, self-authenticating via ITA. - "noise": { - "quote_b64": base64::engine::general_purpose::STANDARD.encode(s.attest.quote()), - "pubkey_hex": hex::encode(s.attest.public_key()), - }, - }); - // `?verbose=1` folds in the CP's current ITA token so operators - // can inspect the CP VM's TDX measurement without a second route - // (the old `/cp/ita` + `/cp/attest` paths were removed; the - // TDX quote for the Noise pubkey is also above, unconditionally). - if q.get("verbose").map(|v| v.as_str()) == Some("1") { - if let Some(obj) = body.as_object_mut() { - obj.insert( - "cp_ita".into(), - serde_json::Value::String(s.cp_ita_token.read().await.clone()), - ); - } - } - Json(body) -} - -#[derive(Debug, Deserialize)] -struct RegisterReq { - vm_name: String, - ita_token: String, - /// Optional per-workload ingress: each entry becomes - /// `{hostname_label}.{agent_hostname}` → `localhost:{port}` in the - /// agent's cloudflared tunnel config, in addition to the default - /// `{agent_hostname}` → `localhost:8080` dashboard rule. - #[serde(default)] - extra_ingress: Vec, -} - -#[derive(Debug, Deserialize)] -struct ExtraIngress { - hostname_label: String, - port: u16, -} - -/// POST /register — the CF Access bypass app on this path lets anyone -/// reach it; the real gate is ITA attestation in-code. We verify the -/// agent's Intel-signed quote, create its tunnel, provision its CF -/// Access apps, and return the tunnel token. 
`owner` / `env_label` -/// used to be in the body for double-check; they're implicit now — -/// the CP authoritatively owns them from its config and the ITA -/// token authenticates the agent regardless. -async fn register( - State(s): State, - Json(req): Json, -) -> Result> { - // ITA is mandatory. Any failure → 401. - let ita_claims = s.verifier.verify(&req.ita_token).await?; - eprintln!( - "cp: ITA verified for {} mrtd={} tcb={}", - req.vm_name, - ita_claims.mrtd.as_deref().unwrap_or("?"), - ita_claims.tcb_status.as_deref().unwrap_or("?") - ); - - let http = reqwest::Client::new(); - let name = cf::agent_tunnel_name(&s.cfg.common.env_label); - let agent_hostname = format!("{name}.{}", s.cfg.cf.domain); - let extras: Vec<(String, u16)> = req - .extra_ingress - .iter() - .map(|e| (e.hostname_label.clone(), e.port)) - .collect(); - let tunnel = cf::create(&http, &s.cfg.cf, &name, &agent_hostname, &extras).await?; - if !tunnel.extra_hostnames.is_empty() { - eprintln!( - "cp: registered extra ingress for {}: {:?}", - req.vm_name, tunnel.extra_hostnames - ); - } - - let labels: Vec = extras.iter().map(|(l, _)| l.clone()).collect(); - if let Err(e) = cf::provision_agent_access( - &http, - &s.cfg.cf, - &s.cfg.common.env_label, - &agent_hostname, - &s.cfg.common.owner, - &s.cfg.access.admin_email, - &labels, - ) - .await - { - eprintln!("cp: provision_agent_access {agent_hostname} failed: {e}"); - } - - // Seed the store so the dashboard shows the agent before the first - // collector tick. Also evict any prior entries with the same - // vm_name — a relaunched VM registers a new agent_id/hostname, but - // the stale old one hangs around until the collector's dead - // threshold (5 min). During that window `/api/agents` returns - // duplicates and host-side scripts can pick the dead hostname. 
- let now = chrono::Utc::now(); - { - let mut store = s.store.lock().await; - let stale: Vec = store - .iter() - .filter(|(id, a)| a.vm_name == req.vm_name && id.as_str() != name) - .map(|(id, _)| id.clone()) - .collect(); - for id in stale { - store.remove(&id); - } - store.insert( - name.clone(), - collector::Agent { - agent_id: name.clone(), - hostname: tunnel.hostname.clone(), - vm_name: req.vm_name.clone(), - attestation_type: "tdx".into(), - status: "healthy".into(), - last_seen: now, - deployment_count: 0, - deployment_names: Vec::new(), - cpu_percent: 0, - memory_used_mb: 0, - memory_total_mb: 0, - nets: Vec::new(), - disks: Vec::new(), - ita: ita_claims, - // Seeded from the boot `extra_ingress`; runtime /deploy - // requests extend this list via /ingress/replace (below). - tunnel_id: tunnel.id.clone(), - extras: extras.clone(), - }, - ); - } - - eprintln!("cp: registered {} as {}", req.vm_name, agent_hostname); - - Ok(Json(serde_json::json!({ - "tunnel_token": tunnel.token, - "hostname": tunnel.hostname, - "agent_id": name, - }))) -} - -#[derive(Debug, Deserialize)] -struct IngressReplaceReq { - /// The agent's own `agent_id` (== tunnel name) as returned from - /// /register. Used to look up the tunnel id in the CP's store. - agent_id: String, - /// Fresh Intel-signed attestation token from the agent — same - /// shape as /register, re-presented here because this endpoint - /// is CF Access-bypassed and the ITA verification is the auth. - /// The agent already refreshes this token every few minutes for - /// /health, so forwarding it on each call is trivial. - ita_token: String, - /// Full replacement set of per-workload ingress rules for this - /// agent. The CP re-PUTs the tunnel config with `extras` first, - /// the primary `hostname → localhost:8080` rule, and the 404 - /// catch-all. Runtime additions from /deploy live alongside the - /// boot-time `extra_ingress` from /register here — the agent - /// owns the merge. 
- extras: Vec, -} - -#[derive(Debug, Deserialize)] -struct IngressPair { - hostname_label: String, - port: u16, -} - -/// POST /ingress/replace — CF-Access-bypassed; authenticated by the -/// same Intel ITA token the agent already refreshes for /health. -/// The agent forwards its full current ingress list; the CP re-PUTs -/// the tunnel config + CNAMEs and reconciles per-workload CF Access -/// bypass apps (creates new, deletes stale). -async fn ingress_replace( - State(s): State, - Json(req): Json, -) -> Result> { - // ITA is the auth — any failure → 401. - let _claims = s.verifier.verify(&req.ita_token).await?; - - let (tunnel_id, hostname) = { - let store = s.store.lock().await; - let agent = store.get(&req.agent_id).ok_or(Error::NotFound)?; - if agent.tunnel_id.is_empty() { - // Control-plane pseudo-entry, or an older store entry that - // pre-dates the tunnel_id field. Either way, nothing to update. - return Err(Error::BadRequest(format!( - "{} has no tunnel — runtime ingress applies only to agent tunnels", - req.agent_id - ))); - } - (agent.tunnel_id.clone(), agent.hostname.clone()) - }; - - let extras: Vec<(String, u16)> = req - .extras - .iter() - .map(|e| (e.hostname_label.clone(), e.port)) - .collect(); - - let http = reqwest::Client::new(); - let hostnames = cf::update_ingress(&http, &s.cfg.cf, &tunnel_id, &hostname, &extras).await?; - - let labels: Vec = extras.iter().map(|(l, _)| l.clone()).collect(); - if let Err(e) = cf::provision_agent_access( - &http, - &s.cfg.cf, - &s.cfg.common.env_label, - &hostname, - &s.cfg.common.owner, - &s.cfg.access.admin_email, - &labels, - ) - .await - { - eprintln!("cp: provision_agent_access on /ingress/replace failed: {e}"); - } - - { - let mut store = s.store.lock().await; - if let Some(agent) = store.get_mut(&req.agent_id) { - agent.extras = extras; - } - } - - eprintln!("cp: ingress/replace {} → {:?}", req.agent_id, hostnames); - Ok(Json(serde_json::json!({ - "agent_id": req.agent_id, - "extra_hostnames": 
hostnames, - }))) -} - -/// Mint the CP's own ITA token at startup. Fatal on any failure — -/// the CP refuses to start without proving its own TDX measurement. -async fn mint_cp_ita(cfg: &Cfg, ee: &Ee) -> Result { - use base64::Engine; - let nonce = base64::engine::general_purpose::STANDARD.encode(uuid::Uuid::new_v4().as_bytes()); - let quote_b64 = ee.attest(&nonce).await?["quote_b64"] - .as_str() - .ok_or_else(|| Error::Upstream("EE attest returned no quote_b64".into()))? - .to_string(); - ita::mint(&cfg.ita.base_url, &cfg.ita.api_key, "e_b64).await -} - -// ── Fleet dashboard ────────────────────────────────────────────────────── - -async fn fleet(State(s): State) -> Response { - let agents = s.store.lock().await.clone(); - let mut rows = String::new(); - let mut by_id: Vec<_> = agents.into_iter().collect(); - by_id.sort_by(|a, b| a.0.cmp(&b.0)); - for (_, a) in &by_id { - let mem = if a.memory_total_mb > 0 { - format!("{}/{} MB", a.memory_used_mb, a.memory_total_mb) - } else { - "—".into() - }; - rows.push_str(&format!( - r#"{vm} -{st}{att} -{cpu}%{mem}{n} -{host}"#, - id = html::escape(&a.agent_id), - vm = html::escape(&a.vm_name), - st = html::escape(&a.status), - att = html::escape(&a.attestation_type), - cpu = a.cpu_percent, - n = a.deployment_count, - host = html::escape(&a.hostname), - )); - } - - let table = if by_id.is_empty() { - r#"
No agents registered
"#.to_string() - } else { - format!( - r#"{rows}
vmstatusattcpumemwlhost
"# - ) - }; - - Html(shell( - "DD Fleet", - &html::nav(&[("Fleet", "/", true)]), - &format!( - r#"

Fleet

{host} · env {env} · {n} agent(s)
{table}"#, - host = html::escape(&s.cfg.hostname), - env = html::escape(&s.cfg.common.env_label), - n = by_id.len(), - ), - )) - .into_response() -} - -// ── Devices API ───────────────────────────────────────────────────────── -// -// Paired client-device X25519 pubkeys that the local Noise gateway -// accepts during the handshake. POST + DELETE are behind the CP's -// human CF Access app (admin enrollment); the machine-readable -// `/trusted` view is edge-bypassed for cross-VM agent polls. - -/// GET /api/v1/admin/export — full state snapshot for a successor CP -/// to hydrate from during a zero-downtime deploy. Returns the -/// device registry (full records, including revoked) and the live -/// agents HashMap. CF-Access-bypassed at the edge; gated in-code by -/// a valid owner-scoped ITA Bearer (any attested enclave in the -/// fleet can authenticate). The new CP calls this against the old -/// CP's still-pointed DNS before flipping CNAMEs. -async fn export_state( - State(s): State, - headers: axum::http::HeaderMap, -) -> Result> { - let bearer = headers - .get(axum::http::header::AUTHORIZATION) - .and_then(|v| v.to_str().ok()) - .and_then(|v| v.strip_prefix("Bearer ")) - .ok_or(Error::Unauthorized)?; - // Any owner-scoped ITA token is OK for v0; tighten to a pinned - // MRTD list once we stop rotating measurements every dev push. - let _ = s.verifier.verify(bearer).await?; - - let devices = s.devices.export_full().await; - let agents: Vec = s.store.lock().await.values().cloned().collect(); - Ok(Json(serde_json::json!({ - "devices": devices, - "agents": agents, - }))) -} - -/// GET /api/v1/devices/trusted — minimal, machine-readable view: -/// `{ "pubkeys": ["", ...] }` with only currently-trusted keys. -/// CF-Access-bypassed at the edge so cross-VM dd-agent callers can -/// reach it; gated in-code by the same three-way policy as -/// `/api/agents`. This is the agent's poll target for mirroring the -/// trust list into its in-memory `TrustHandle`. 
-async fn list_trusted_devices( - State(s): State, - axum::extract::ConnectInfo(peer): axum::extract::ConnectInfo, - headers: axum::http::HeaderMap, -) -> Result> { - if !agents_auth_ok(&s, peer, &headers).await { - return Err(Error::Unauthorized); - } - let devices = s.devices.list().await; - let pubkeys: Vec = devices - .into_iter() - .filter(|d| d.revoked_at_ms.is_none()) - .map(|d| d.pubkey) - .collect(); - Ok(Json(serde_json::json!({ "pubkeys": pubkeys }))) -} - -#[derive(Debug, Deserialize)] -struct CreateDeviceReq { - pubkey: String, - label: String, -} - -/// POST /api/v1/devices — enroll a device pubkey. Idempotent on -/// pubkey: re-posting with a new label replaces the record in place. -async fn create_device( - State(s): State, - Json(req): Json, -) -> Result<(axum::http::StatusCode, Json)> { - let pubkey = req.pubkey.to_lowercase(); - crate::devices::validate_hex_pubkey(&pubkey).map_err(|e| Error::BadRequest(e.to_string()))?; - let label = req.label.trim().to_string(); - if label.is_empty() || label.len() > 128 { - return Err(Error::BadRequest("label must be 1..=128 chars".into())); - } - let device = crate::devices::Device { - pubkey, - label, - created_at_ms: chrono::Utc::now().timestamp_millis(), - revoked_at_ms: None, - }; - s.devices - .upsert(device.clone()) - .await - .map_err(|e| Error::Internal(format!("devices upsert: {e}")))?; - Ok((axum::http::StatusCode::CREATED, Json(device))) -} - -/// DELETE /api/v1/devices/{pubkey} — revoke. Returns 404 if the -/// pubkey isn't known or was already revoked. 
-async fn revoke_device( - State(s): State, - Path(pubkey): Path, -) -> Result> { - let pubkey = pubkey.to_lowercase(); - let now = chrono::Utc::now().timestamp_millis(); - let ok = s - .devices - .revoke(&pubkey, now) - .await - .map_err(|e| Error::Internal(format!("devices revoke: {e}")))?; - if !ok { - return Err(Error::NotFound); - } - Ok(Json(serde_json::json!({ - "revoked": pubkey, - "at_ms": now, - }))) -} - -/// GET /admin/enroll?pubkey=…&label=… — human-facing confirmation -/// page that a `bastion-app` (CLI or desktop) bounces the operator -/// to. Behind the CP's human CF Access app: by the time this -/// handler renders, the browser has a valid CF Access session -/// cookie. The rendered page POSTs to `/api/v1/devices` with the -/// same cookie via `credentials: "same-origin"`, completing the -/// enrollment that headless clients can't do themselves. -/// -/// Intent-over-GET: we deliberately don't enroll on page load — -/// the user clicks Confirm so a copy-pasted link can't silently -/// add a pubkey. -async fn enroll_page(Query(q): Query>) -> Response { - let pubkey = q.get("pubkey").cloned().unwrap_or_default(); - let label = q.get("label").cloned().unwrap_or_default(); - - if let Err(e) = crate::devices::validate_hex_pubkey(&pubkey) { - return Html(shell( - "Enroll device", - "", - &format!( - r#"

Invalid pubkey

{}

"#, - html::escape(&e.to_string()) - ), - )) - .into_response(); - } - if label.trim().is_empty() || label.len() > 128 { - return Html(shell( - "Enroll device", - "", - r#"

Invalid label

label must be 1..=128 chars

"#, - )) - .into_response(); - } - - let short = &pubkey[..16]; - let body = format!( - r#"
-

Enroll this device?

-
Label{label}
-
Pubkey{short}…
-

- Confirming adds this X25519 public key to the trust list. Every - DD agent mirrors that list within 30 s; thereafter, a client - holding the matching private key can open Noise_IK sessions to - any enclave in the fleet. Revoke any time with - DELETE /api/v1/devices/<pubkey>. -

-

-
- - Cancel -
-
-"#, - label = html::escape(&label), - short = html::escape(short), - pubkey_js = serde_json::to_string(&pubkey).unwrap_or_else(|_| "\"\"".into()), - label_js = serde_json::to_string(&label).unwrap_or_else(|_| "\"\"".into()), - ); - - Html(shell("Enroll device", "", &body)).into_response() -} - -/// GET /api/agents — JSON list of -/// `{agent_id, vm_name, hostname, status, last_seen}`. -/// -/// Gated three ways, any of which succeeds: -/// 1. **Loopback** (`127.0.0.1`, `::1`) — same-VM callers (any -/// workload on the CP VM, dd-agent's own proxy). Trust is anchored -/// by EE's Tier-1 seal: a process on the VM at all is already a -/// workload EE spawned and gave the shared `EE_TOKEN` env to. -/// 2. **GH OIDC** — CI action (`dd-deploy`, etc.) presents a GitHub -/// Actions OIDC JWT as `Authorization: Bearer `; we verify -/// against GitHub's JWKS and require the principal carried by -/// `DD_OWNER`/`DD_OWNER_ID`/`DD_OWNER_KIND` (see -/// [`gh_oidc::Principal::matches`]). Matches the pattern dd-agent -/// uses for `/deploy` + `/exec`. -/// 3. **ITA** — dd-agent's `/api/agents` proxy forwards its own -/// Intel-attested ITA token so cross-VM calls from any attested -/// DD agent in the fleet succeed. -/// -/// Without one of those, respond with 401. 
-async fn api_agents( - State(s): State, - axum::extract::ConnectInfo(peer): axum::extract::ConnectInfo, - headers: axum::http::HeaderMap, -) -> Result>> { - if !agents_auth_ok(&s, peer, &headers).await { - return Err(Error::Unauthorized); - } - let agents = s.store.lock().await.clone(); - Ok(Json( - agents - .into_values() - .map(|a| { - serde_json::json!({ - "agent_id": a.agent_id, - "vm_name": a.vm_name, - "hostname": a.hostname, - "status": a.status, - "last_seen": a.last_seen.to_rfc3339(), - }) - }) - .collect(), - )) -} - -/// Accept the request if the caller is on the loopback interface -/// (same-VM trust — any CP-VM workload / dd-agent-proxy) or presents a valid -/// bearer that verifies as either a GitHub Actions OIDC token for -/// this owner, or a fresh Intel-signed ITA token for this CP. See -/// [`api_agents`] for the full policy. -async fn agents_auth_ok( - s: &St, - peer: std::net::SocketAddr, - headers: &axum::http::HeaderMap, -) -> bool { - if peer.ip().is_loopback() { - return true; - } - let bearer = headers - .get(axum::http::header::AUTHORIZATION) - .and_then(|v| v.to_str().ok()) - .and_then(|s| { - s.strip_prefix("Bearer ") - .or_else(|| s.strip_prefix("bearer ")) - }) - .map(str::trim) - .filter(|t| !t.is_empty()); - let Some(token) = bearer else { - return false; - }; - // `gh.verify` already enforces the principal match — a successful - // result is itself the authorization. - if s.gh.verify(token).await.is_ok() { - return true; - } - if s.verifier.verify(token).await.is_ok() { - return true; - } - false -} - -async fn agent_detail(State(s): State, Path(id): Path) -> Response { - let agent = s.store.lock().await.get(&id).cloned(); - let Some(a) = agent else { - return ( - axum::http::StatusCode::NOT_FOUND, - Html(shell("Not found", "", "

Not found

")), - ) - .into_response(); - }; - - let is_cp = a.agent_id == "control-plane"; - let mut workloads = String::new(); - for w in &a.deployment_names { - let link = if is_cp { - format!( - r#"logs"#, - id = html::escape(&a.agent_id), - w = html::escape(w) - ) - } else { - String::new() - }; - workloads.push_str(&format!( - r#"{w}{link}"#, - w = html::escape(w) - )); - } - let wl_table = if a.deployment_names.is_empty() { - r#"
No workloads
"#.to_string() - } else { - format!(r#"{workloads}
workload
"#) - }; - - let ita_card = { - let c = &a.ita; - let tcb = c.tcb_status.as_deref().unwrap_or("?"); - let tcb_cls = match tcb { - "UpToDate" | "OK" => "running", - "OutOfDate" | "SWHardeningNeeded" | "ConfigurationNeeded" => "deploying", - "Revoked" | "Invalid" => "failed", - _ => "idle", - }; - let mrtd_short = c - .mrtd - .as_deref() - .map(|m| if m.len() > 16 { &m[..16] } else { m }) - .unwrap_or("?"); - let delta = c.exp - chrono::Utc::now().timestamp(); - let expiry = if delta > 0 { - format!("in {}m", delta / 60) - } else { - "expired".to_string() - }; - format!( - r#"
Intel Trust Authority
-
-
TCB status{tcb}
-
MRTD{mrtd}…
-
Attester type{typ}
-
Expires{exp}
-
"#, - cls = tcb_cls, - tcb = html::escape(tcb), - mrtd = html::escape(mrtd_short), - typ = html::escape(c.attester_type.as_deref().unwrap_or("?")), - exp = expiry, - ) - }; - - // `{hostname-base}-block.{tld}` is the ttyd subdomain (CP's own - // tunnel publishes it; agents publish it via their register-time - // `extra_ingress`). Flat shape so Universal SSL covers the cert. - // Human-gated by CF Access. - let term_host = html::escape(&cf::label_hostname(&a.hostname, "block")); - let extra = if is_cp { - format!( - r#"

Terminal ↗ · health (incl. noise quote) · health?verbose=1 (incl. ita)

"# - ) - } else { - format!( - r#"

open agent dashboard ↗ · Terminal ↗

"#, - h = html::escape(&a.hostname) - ) - }; - - let disks_table = if a.disks.is_empty() { - String::new() - } else { - let mut rows = String::new(); - for d in &a.disks { - rows.push_str(&format!( - "{m}{fs}{u} / {t}", - m = html::escape(&d.mount), - fs = html::escape(&d.fstype), - u = metrics::format_bytes_si(d.used_bytes), - t = metrics::format_bytes_si(d.total_bytes), - )); - } - format!( - r#"
Disks
{rows}
mountfsused
"# - ) - }; - - let nets_table = if a.nets.is_empty() { - String::new() - } else { - let mut rows = String::new(); - for n in &a.nets { - rows.push_str(&format!( - "{i}{rx}{tx}", - i = html::escape(&n.iface), - rx = metrics::format_bytes_si(n.rx_bytes), - tx = metrics::format_bytes_si(n.tx_bytes), - )); - } - format!( - r#"
Network
{rows}
ifacerxtx
"# - ) - }; - - Html(shell( - &format!("DD — {}", a.vm_name), - &html::nav(&[("Fleet", "/", false)]), - &format!( - r#" -

{vm}

{id} · {host}
-
-
Status{st}
-
Attestation{att}
-
Last seen{ls}
-
CPU{cpu}%
-
Memory{mu}/{mt} MB
-
-{disks_table} -{nets_table} -{ita_card} -
Workloads
{wl_table} -{extra}"#, - vm = html::escape(&a.vm_name), - id = html::escape(&a.agent_id), - host = html::escape(&a.hostname), - st = html::escape(&a.status), - att = html::escape(&a.attestation_type), - ls = a.last_seen.to_rfc3339(), - cpu = a.cpu_percent, - mu = a.memory_used_mb, - mt = a.memory_total_mb, - disks_table = disks_table, - nets_table = nets_table, - ita_card = ita_card, - ), - )) - .into_response() -} - -/// GET /agent/control-plane/logs/{app} — show logs for a CP workload via the -/// local easyenclave socket. For other agents we'd proxy to their dashboard; -/// today the detail page links directly there instead. -async fn agent_logs(State(s): State, Path((id, app)): Path<(String, String)>) -> Response { - if id != "control-plane" { - return Error::NotFound.into_response(); - } - // Find the workload by app_name. - let list = s.ee.list().await.unwrap_or_default(); - let dep_id = list["deployments"] - .as_array() - .and_then(|a| a.iter().find(|d| d["app_name"].as_str() == Some(&app))) - .and_then(|d| d["id"].as_str()) - .map(String::from); - let Some(dep_id) = dep_id else { - return Error::NotFound.into_response(); - }; - let logs = s.ee.logs(&dep_id).await.unwrap_or_default(); - let text = logs["lines"] - .as_array() - .map(|a| { - a.iter() - .filter_map(|v| v.as_str()) - .map(html::escape) - .collect::>() - .join("\n") - }) - .unwrap_or_default(); - Html(shell( - &format!("{app} logs"), - &html::nav(&[("Fleet", "/", false)]), - &format!( - r#" -

{app}

auto-refresh 2s
-
{text}
-"#, - app = html::escape(&app) - ), - )) - .into_response() -} diff --git a/src/devices.rs b/src/devices.rs deleted file mode 100644 index baa6f27..0000000 --- a/src/devices.rs +++ /dev/null @@ -1,298 +0,0 @@ -//! Device pubkey registry. -//! -//! Holds the X25519 pubkeys of paired client devices. Source of truth -//! lives on the CP's disk at [`Store::path`] (JSON, pretty-printed for -//! human editability in a pinch). The live set of *non-revoked* -//! pubkeys is also mirrored into a [`noise_gateway::TrustHandle`] so -//! the locally-running Noise gateway can read it directly from shared -//! memory — no on-disk runtime view, no cross-process file contract. -//! -//! Wire format on disk: -//! ```json -//! { -//! "devices": [ -//! { "pubkey": "<64-hex>", "label": "alice@laptop", -//! "created_at_ms": 1734567890000, "revoked_at_ms": null } -//! ] -//! } -//! ``` - -use std::collections::{BTreeMap, HashSet}; -use std::path::{Path, PathBuf}; -use std::sync::Arc; - -use serde::{Deserialize, Serialize}; -use tokio::sync::RwLock; - -use crate::noise_gateway::TrustHandle; - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub struct Device { - pub pubkey: String, - pub label: String, - pub created_at_ms: i64, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub revoked_at_ms: Option, -} - -#[derive(Default, Serialize, Deserialize)] -struct OnDisk { - #[serde(default)] - devices: Vec, -} - -pub struct Store { - path: PathBuf, - inner: RwLock>, - trust: TrustHandle, -} - -impl Store { - /// Load the source-of-truth file (missing is fine — starts empty) - /// and seed the shared `TrustHandle` with the current non-revoked - /// set. Every mutation after this recomputes the handle in place. 
- pub async fn load(path: PathBuf, trust: TrustHandle) -> anyhow::Result> { - let devices = match tokio::fs::read(&path).await { - Ok(bytes) => { - let parsed: OnDisk = serde_json::from_slice(&bytes)?; - parsed.devices - } - Err(e) if e.kind() == std::io::ErrorKind::NotFound => Vec::new(), - Err(e) => return Err(e.into()), - }; - let map: BTreeMap<_, _> = devices.into_iter().map(|d| (d.pubkey.clone(), d)).collect(); - let store = Arc::new(Self { - path, - inner: RwLock::new(map), - trust, - }); - store.sync_trust_handle().await; - Ok(store) - } - - pub async fn list(&self) -> Vec { - self.inner.read().await.values().cloned().collect() - } - - pub async fn upsert(&self, device: Device) -> anyhow::Result<()> { - { - let mut w = self.inner.write().await; - w.insert(device.pubkey.clone(), device); - } - self.flush_source().await?; - self.sync_trust_handle().await; - Ok(()) - } - - /// Marks `pubkey` as revoked at `now_ms`. Returns `true` if the - /// record existed and wasn't already revoked. - pub async fn revoke(&self, pubkey: &str, now_ms: i64) -> anyhow::Result { - let ok = { - let mut w = self.inner.write().await; - match w.get_mut(pubkey) { - Some(d) if d.revoked_at_ms.is_none() => { - d.revoked_at_ms = Some(now_ms); - true - } - _ => false, - } - }; - if ok { - self.flush_source().await?; - self.sync_trust_handle().await; - } - Ok(ok) - } - - /// Full snapshot — including revoked records — for `/api/v1/admin/export`. - pub async fn export_full(&self) -> Vec { - self.list().await - } - - /// Merge a batch of device records into the store. Each pubkey - /// overwrites any existing record (later `revoked_at_ms` wins via - /// plain overwrite since callers always hand us the latest). - /// Persists to disk + refreshes the trust handle. 
- pub async fn import_merge(&self, devices: Vec) -> anyhow::Result { - let mut n = 0; - { - let mut w = self.inner.write().await; - for d in devices { - w.insert(d.pubkey.clone(), d); - n += 1; - } - } - self.flush_source().await?; - self.sync_trust_handle().await; - Ok(n) - } - - async fn flush_source(&self) -> anyhow::Result<()> { - let devices: Vec = self.inner.read().await.values().cloned().collect(); - let on_disk = OnDisk { devices }; - let bytes = serde_json::to_vec_pretty(&on_disk)?; - atomic_write(&self.path, &bytes).await - } - - async fn sync_trust_handle(&self) { - let guard = self.inner.read().await; - let mut fresh: HashSet<[u8; 32]> = HashSet::with_capacity(guard.len()); - for d in guard.values() { - if d.revoked_at_ms.is_some() { - continue; - } - if let Ok(bytes) = hex::decode(&d.pubkey) { - if bytes.len() == 32 { - let mut k = [0u8; 32]; - k.copy_from_slice(&bytes); - fresh.insert(k); - } - } - } - *self.trust.write().await = fresh; - } -} - -async fn atomic_write(path: &Path, bytes: &[u8]) -> anyhow::Result<()> { - if let Some(parent) = path.parent() { - tokio::fs::create_dir_all(parent).await.ok(); - } - let tmp = path.with_extension("json.tmp"); - tokio::fs::write(&tmp, bytes).await?; - tokio::fs::rename(&tmp, path).await?; - Ok(()) -} - -/// Validate a hex pubkey: exactly 64 chars of lowercase or uppercase -/// hex, decoding to 32 bytes. 
-pub fn validate_hex_pubkey(s: &str) -> anyhow::Result<()> { - if s.len() != 64 { - anyhow::bail!("pubkey must be 64 hex chars (got {})", s.len()); - } - let bytes = hex::decode(s).map_err(|e| anyhow::anyhow!("not valid hex: {e}"))?; - if bytes.len() != 32 { - anyhow::bail!("pubkey must decode to 32 bytes (got {})", bytes.len()); - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::noise_gateway; - - fn mk_dev(pk: &str, label: &str) -> Device { - Device { - pubkey: pk.into(), - label: label.into(), - created_at_ms: 1, - revoked_at_ms: None, - } - } - - fn pk_bytes(hex_str: &str) -> [u8; 32] { - let v = hex::decode(hex_str).unwrap(); - let mut k = [0u8; 32]; - k.copy_from_slice(&v); - k - } - - #[tokio::test] - async fn upsert_persists_and_mirrors_trust_handle() { - let dir = tempfile::tempdir().unwrap(); - let src = dir.path().join("devices.json"); - let trust = noise_gateway::new_trust_handle(); - let store = Store::load(src.clone(), trust.clone()).await.unwrap(); - - let pk1 = "a".repeat(64); - let pk2 = "b".repeat(64); - store.upsert(mk_dev(&pk1, "laptop")).await.unwrap(); - store.upsert(mk_dev(&pk2, "phone")).await.unwrap(); - - // Source has both records. - let src_val: serde_json::Value = - serde_json::from_slice(&tokio::fs::read(&src).await.unwrap()).unwrap(); - assert_eq!(src_val["devices"].as_array().unwrap().len(), 2); - - // Trust handle reflects both pubkeys. 
- let live = trust.read().await; - assert!(live.contains(&pk_bytes(&pk1))); - assert!(live.contains(&pk_bytes(&pk2))); - assert_eq!(live.len(), 2); - } - - #[tokio::test] - async fn revoke_drops_from_trust_handle_but_keeps_source() { - let dir = tempfile::tempdir().unwrap(); - let src = dir.path().join("devices.json"); - let trust = noise_gateway::new_trust_handle(); - let store = Store::load(src.clone(), trust.clone()).await.unwrap(); - - let pk = "c".repeat(64); - store.upsert(mk_dev(&pk, "laptop")).await.unwrap(); - assert!(store.revoke(&pk, 99).await.unwrap()); - assert!(!store.revoke(&pk, 100).await.unwrap()); - - let src_val: serde_json::Value = - serde_json::from_slice(&tokio::fs::read(&src).await.unwrap()).unwrap(); - assert_eq!(src_val["devices"][0]["revoked_at_ms"].as_i64(), Some(99)); - - assert!(trust.read().await.is_empty()); - } - - #[tokio::test] - async fn load_persists_across_instances() { - let dir = tempfile::tempdir().unwrap(); - let src = dir.path().join("devices.json"); - let pk = "e".repeat(64); - { - let trust = noise_gateway::new_trust_handle(); - let s = Store::load(src.clone(), trust).await.unwrap(); - s.upsert(mk_dev(&pk, "desktop")).await.unwrap(); - } - let trust2 = noise_gateway::new_trust_handle(); - let s2 = Store::load(src.clone(), trust2.clone()).await.unwrap(); - let list = s2.list().await; - assert_eq!(list.len(), 1); - assert_eq!(list[0].pubkey, pk); - assert!(trust2.read().await.contains(&pk_bytes(&pk))); - } - - #[tokio::test] - async fn import_merge_roundtrips_with_export() { - let dir = tempfile::tempdir().unwrap(); - let src_a = dir.path().join("a.json"); - let src_b = dir.path().join("b.json"); - let trust_a = noise_gateway::new_trust_handle(); - let trust_b = noise_gateway::new_trust_handle(); - let a = Store::load(src_a, trust_a).await.unwrap(); - let b = Store::load(src_b, trust_b.clone()).await.unwrap(); - - let pk_live = "1".repeat(64); - let pk_revoked = "2".repeat(64); - a.upsert(mk_dev(&pk_live, 
"live")).await.unwrap(); - a.upsert(mk_dev(&pk_revoked, "gone")).await.unwrap(); - a.revoke(&pk_revoked, 42).await.unwrap(); - - let exported = a.export_full().await; - let merged = b.import_merge(exported).await.unwrap(); - assert_eq!(merged, 2); - - let list_b = b.list().await; - assert_eq!(list_b.len(), 2); - // Only the non-revoked pubkey is in B's trust handle. - let trust = trust_b.read().await; - assert_eq!(trust.len(), 1); - assert!(trust.contains(&pk_bytes(&pk_live))); - assert!(!trust.contains(&pk_bytes(&pk_revoked))); - } - - #[test] - fn validate_hex_pubkey_happy_and_sad() { - validate_hex_pubkey(&"0".repeat(64)).unwrap(); - validate_hex_pubkey(&"F".repeat(64)).unwrap(); - assert!(validate_hex_pubkey("").is_err()); - assert!(validate_hex_pubkey(&"0".repeat(63)).is_err()); - assert!(validate_hex_pubkey(&"g".repeat(64)).is_err()); - } -} diff --git a/src/ee.rs b/src/ee.rs deleted file mode 100644 index 01f7523..0000000 --- a/src/ee.rs +++ /dev/null @@ -1,119 +0,0 @@ -//! Unix-socket client for the easyenclave daemon. -//! -//! Newline-delimited JSON request/response, one exchange per connection. -//! `attach()` is special — after the JSON ack the socket carries raw PTY bytes. - -use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader}; -use tokio::net::UnixStream; - -use crate::error::{Error, Result}; - -pub struct Ee { - path: String, - /// Boot token minted by easyenclave and handed to dd-agent via - /// `EE_TOKEN`. When set, every socket request carries - /// `"token": ""` so EE's seal (easyenclave#80) accepts us. - /// On unpatched EE images the field is ignored as an unknown key; - /// makes this change forward-compatible either direction. 
- token: Option, -} - -impl Ee { - pub fn new(path: impl Into) -> Self { - Self { - path: path.into(), - token: std::env::var("EE_TOKEN").ok().filter(|s| !s.is_empty()), - } - } - - async fn call(&self, mut req: serde_json::Value) -> Result { - if let Some(token) = &self.token { - req["token"] = serde_json::Value::String(token.clone()); - } - - let stream = UnixStream::connect(&self.path) - .await - .map_err(|e| Error::Upstream(format!("EE connect {}: {e}", self.path)))?; - let (rd, mut wr) = stream.into_split(); - - let mut buf = serde_json::to_vec(&req)?; - buf.push(b'\n'); - wr.write_all(&buf).await?; - wr.shutdown().await?; - - let mut line = String::new(); - BufReader::new(rd).read_line(&mut line).await?; - Ok(serde_json::from_str(line.trim())?) - } - - pub async fn health(&self) -> Result { - self.call(serde_json::json!({"method": "health"})).await - } - - pub async fn list(&self) -> Result { - self.call(serde_json::json!({"method": "list"})).await - } - - pub async fn logs(&self, id: &str) -> Result { - self.call(serde_json::json!({"method": "logs", "id": id})) - .await - } - - pub async fn attest(&self, nonce: &str) -> Result { - self.call(serde_json::json!({"method": "attest", "nonce": nonce})) - .await - } - - /// Deploy a workload at runtime. Spec is a workload object - /// (`app_name`, `github_release`/`cmd`, `env`, ...) — we just set - /// `method` and forward. - pub async fn deploy(&self, mut spec: serde_json::Value) -> Result { - spec["method"] = serde_json::json!("deploy"); - self.call(spec).await - } - - /// Run a command inside the enclave and capture stdout/stderr. - pub async fn exec(&self, cmd: &[String], timeout_secs: u64) -> Result { - self.call(serde_json::json!({ - "method": "exec", - "cmd": cmd, - "timeout_secs": timeout_secs, - })) - .await - } - - /// Open a PTY shell. Sends the attach request, reads the ack, returns - /// the raw stream for byte bridging to a WebSocket. 
- pub async fn attach(&self, cmd: &[String]) -> Result { - let mut stream = UnixStream::connect(&self.path) - .await - .map_err(|e| Error::Upstream(format!("EE attach {}: {e}", self.path)))?; - - let mut req = serde_json::json!({"method": "attach", "cmd": cmd}); - if let Some(token) = &self.token { - req["token"] = serde_json::Value::String(token.clone()); - } - let mut buf = serde_json::to_vec(&req)?; - buf.push(b'\n'); - stream.write_all(&buf).await?; - - // Read one line (the ack) without buffering — next bytes belong to the caller. - let mut line = Vec::new(); - let mut byte = [0u8; 1]; - loop { - match stream.read(&mut byte).await? { - 0 => return Err(Error::Upstream("EE attach: closed before ack".into())), - _ if byte[0] == b'\n' => break, - _ if line.len() > 4096 => { - return Err(Error::Upstream("EE attach: ack too long".into())) - } - _ => line.push(byte[0]), - } - } - let ack: serde_json::Value = serde_json::from_slice(&line)?; - if ack["ok"].as_bool() != Some(true) { - return Err(Error::Upstream(format!("EE attach rejected: {ack}"))); - } - Ok(stream) - } -} diff --git a/src/error.rs b/src/error.rs deleted file mode 100644 index 1c1e63d..0000000 --- a/src/error.rs +++ /dev/null @@ -1,54 +0,0 @@ -use axum::http::StatusCode; -use axum::response::IntoResponse; - -#[derive(Debug, thiserror::Error)] -pub enum Error { - #[error("bad request: {0}")] - BadRequest(String), - - #[error("unauthorized")] - Unauthorized, - - #[error("not found")] - NotFound, - - #[error("upstream: {0}")] - Upstream(String), - - #[error("internal: {0}")] - Internal(String), -} - -pub type Result = std::result::Result; - -impl IntoResponse for Error { - fn into_response(self) -> axum::response::Response { - let (status, code) = match &self { - Error::BadRequest(_) => (StatusCode::BAD_REQUEST, "BAD_REQUEST"), - Error::Unauthorized => (StatusCode::UNAUTHORIZED, "UNAUTHORIZED"), - Error::NotFound => (StatusCode::NOT_FOUND, "NOT_FOUND"), - Error::Upstream(_) => (StatusCode::BAD_GATEWAY, 
"UPSTREAM"), - Error::Internal(_) => (StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL"), - }; - let body = axum::Json(serde_json::json!({"code": code, "message": self.to_string()})); - (status, body).into_response() - } -} - -impl From for Error { - fn from(e: reqwest::Error) -> Self { - Error::Upstream(e.to_string()) - } -} - -impl From for Error { - fn from(e: serde_json::Error) -> Self { - Error::BadRequest(e.to_string()) - } -} - -impl From for Error { - fn from(e: std::io::Error) -> Self { - Error::Internal(e.to_string()) - } -} diff --git a/src/gh_oidc.rs b/src/gh_oidc.rs deleted file mode 100644 index bc414a6..0000000 --- a/src/gh_oidc.rs +++ /dev/null @@ -1,605 +0,0 @@ -//! GitHub Actions OIDC verifier. -//! -//! GitHub Actions mints an OIDC JWT per job (issuer -//! `https://token.actions.githubusercontent.com`, JWKS at -//! `/.well-known/jwks`). A caller passes the token as -//! `Authorization: Bearer `; the agent verifies the signature -//! against the cached JWKS and checks the required claims: -//! -//! - `iss` matches the GitHub issuer -//! - `aud` matches the configured audience (default `dd-agent`) -//! - the principal in `DD_OWNER` / `DD_OWNER_ID` / `DD_OWNER_KIND` -//! matches via [`Principal::matches`] -//! -//! A "principal" is one of three kinds — a GitHub user, a GitHub -//! organization, or a specific repository. User and org are -//! textually identical at the token layer (both produce the same -//! `repository_owner` claim); the kind is carried alongside only so -//! `cf.rs::human_policy` can decide whether to install a -//! `github-organization` CF Access include rule. A repo principal -//! gates on `repository` instead. All three flavors require the -//! corresponding numeric ID claim (`repository_owner_id` or -//! `repository_id`) to match too — name-only matching would let a -//! re-registered deleted login produce accepted tokens. 
- -use std::collections::HashMap; -use std::sync::Arc; - -use jsonwebtoken::{Algorithm, DecodingKey, Validation}; -use reqwest::Client; -use serde::{Deserialize, Serialize}; -use tokio::sync::RwLock; - -use crate::error::{Error, Result}; - -const ISSUER: &str = "https://token.actions.githubusercontent.com"; -const JWKS_URL: &str = "https://token.actions.githubusercontent.com/.well-known/jwks"; -const LEEWAY_SECS: u64 = 60; - -const ALLOWED_ALGS: &[Algorithm] = &[ - Algorithm::RS256, - Algorithm::RS384, - Algorithm::RS512, - Algorithm::PS256, - Algorithm::PS384, - Algorithm::PS512, -]; - -#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)] -#[serde(rename_all = "lowercase")] -pub enum PrincipalKind { - User, - Org, - Repo, -} - -impl PrincipalKind { - pub fn as_str(&self) -> &'static str { - match self { - PrincipalKind::User => "user", - PrincipalKind::Org => "org", - PrincipalKind::Repo => "repo", - } - } - - pub fn parse(s: &str) -> Result { - match s { - "user" => Ok(PrincipalKind::User), - "org" => Ok(PrincipalKind::Org), - "repo" => Ok(PrincipalKind::Repo), - other => Err(Error::Internal(format!( - "invalid principal kind {other:?} (expected user|org|repo)" - ))), - } - } -} - -/// One of the three principal kinds the agent's `/deploy` verifier -/// accepts. See module docs for shape. -#[derive(Clone, Debug, Serialize)] -pub struct Principal { - pub name: String, - pub id: u64, - pub kind: PrincipalKind, -} - -impl Principal { - /// Construct a Principal, validating that `name` and `kind` - /// agree on shape (repo kinds need a slash; user/org kinds must - /// not have one) and that `id` is non-zero. 
- pub fn from_parts(name: String, id: u64, kind: PrincipalKind) -> Result { - let has_slash = name.contains('/'); - let shape_ok = matches!( - (kind, has_slash), - (PrincipalKind::Repo, true) - | (PrincipalKind::User, false) - | (PrincipalKind::Org, false) - ); - if !shape_ok { - return Err(Error::Internal(format!( - "principal shape mismatch: name={name:?} kind={} \ - (kind=repo requires '/', kind=user|org rejects '/')", - kind.as_str() - ))); - } - if name.is_empty() { - return Err(Error::Internal("principal name must be non-empty".into())); - } - if id == 0 { - return Err(Error::Internal( - "principal id must be non-zero (defeats login-squat)".into(), - )); - } - Ok(Self { name, id, kind }) - } - - /// True iff this principal authorizes the bearer of `c`. - /// `kind=user|org` matches on `repository_owner`+`repository_owner_id`. - /// `kind=repo` matches on `repository`+`repository_id`. - /// A token missing the corresponding numeric claim fails — that's - /// the squat-defense. - pub fn matches(&self, c: &Claims) -> bool { - match self.kind { - PrincipalKind::User | PrincipalKind::Org => { - c.repository_owner == self.name - && c.repository_owner_id != 0 - && c.repository_owner_id == self.id - } - PrincipalKind::Repo => { - c.repository == self.name && c.repository_id != 0 && c.repository_id == self.id - } - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub struct Claims { - pub exp: i64, - pub iat: i64, - pub iss: String, - #[serde(default)] - pub sub: String, - #[serde(default)] - pub repository: String, - #[serde(default)] - pub repository_id: u64, - #[serde(default)] - pub repository_owner: String, - #[serde(default)] - pub repository_owner_id: u64, - #[serde(default)] - pub ref_: String, - #[serde(default)] - pub workflow: String, -} - -pub struct Verifier { - owner: Principal, - audience: String, - http: Client, - keys: RwLock>, -} - -impl Verifier { - pub fn new(owner: Principal, audience: String) -> Arc { - Arc::new(Self { - 
owner, - audience, - http: Client::new(), - keys: RwLock::new(HashMap::new()), - }) - } - - /// Verify a JWT and require it match the fleet `owner`. Use this - /// for endpoints only the fleet should reach — e.g. `/owner`, - /// which re-assigns an agent to a tenant. - pub async fn verify(&self, token: &str) -> Result { - self.verify_allowing(token, None).await - } - - /// Verify a JWT and accept the caller if it matches EITHER the - /// fleet `owner` OR the passed `extra` principal (typically the - /// agent's runtime `agent_owner`, set by the s12e bot via - /// `POST /owner` when a claim activates). Use this for - /// workload-control endpoints (`/deploy`, `/exec`, `/logs`) that - /// should accept either ops or the active tenant. - pub async fn verify_allowing(&self, token: &str, extra: Option<&Principal>) -> Result { - let claims = self.decode_and_validate(token).await?; - let ok = self.owner.matches(&claims) || extra.is_some_and(|p| p.matches(&claims)); - if !ok { - return Err(Error::Unauthorized); - } - Ok(claims) - } - - /// JWT decode + signature/issuer/audience validation, without - /// any owner check. Extracted so `verify` and `verify_allowing` - /// share identical crypto/claim-parsing behaviour and only - /// differ in the final authorization gate. - async fn decode_and_validate(&self, token: &str) -> Result { - let header = jsonwebtoken::decode_header(token) - .map_err(|e| Error::BadRequest(format!("gh oidc header: {e}")))?; - if !ALLOWED_ALGS.contains(&header.alg) { - return Err(Error::BadRequest(format!( - "gh oidc alg {:?} not allowed", - header.alg - ))); - } - let kid = header - .kid - .ok_or_else(|| Error::BadRequest("gh oidc token missing kid".into()))?; - - let key = match self.lookup(&kid).await { - Some(k) => k, - None => { - self.refresh().await?; - self.lookup(&kid) - .await - .ok_or_else(|| Error::BadRequest(format!("gh oidc kid {kid} not in JWKS")))? 
- } - }; - - let mut v = Validation::new(header.alg); - v.set_issuer(&[ISSUER]); - v.set_audience(&[self.audience.as_str()]); - v.leeway = LEEWAY_SECS; - v.set_required_spec_claims(&["exp", "iat", "iss", "aud"]); - - let data = jsonwebtoken::decode::(token, &key, &v) - .map_err(|e| Error::BadRequest(format!("gh oidc verify: {e}")))?; - - let raw = data.claims; - Ok(Claims { - exp: raw.get("exp").and_then(|x| x.as_i64()).unwrap_or(0), - iat: raw.get("iat").and_then(|x| x.as_i64()).unwrap_or(0), - iss: raw.get("iss").and_then(|x| x.as_str()).unwrap_or("").into(), - sub: raw.get("sub").and_then(|x| x.as_str()).unwrap_or("").into(), - repository: raw - .get("repository") - .and_then(|x| x.as_str()) - .unwrap_or("") - .into(), - repository_id: raw - .get("repository_id") - .and_then(|x| x.as_u64()) - .unwrap_or(0), - repository_owner: raw - .get("repository_owner") - .and_then(|x| x.as_str()) - .unwrap_or("") - .into(), - repository_owner_id: raw - .get("repository_owner_id") - .and_then(|x| x.as_u64()) - .unwrap_or(0), - ref_: raw.get("ref").and_then(|x| x.as_str()).unwrap_or("").into(), - workflow: raw - .get("workflow") - .and_then(|x| x.as_str()) - .unwrap_or("") - .into(), - }) - } - - async fn lookup(&self, kid: &str) -> Option { - self.keys.read().await.get(kid).cloned() - } - - async fn refresh(&self) -> Result<()> { - let resp = self - .http - .get(JWKS_URL) - .send() - .await - .map_err(|e| Error::Upstream(format!("GH JWKS fetch: {e}")))?; - if !resp.status().is_success() { - return Err(Error::Upstream(format!("GH JWKS: HTTP {}", resp.status()))); - } - let jwks: jsonwebtoken::jwk::JwkSet = resp - .json() - .await - .map_err(|e| Error::Upstream(format!("GH JWKS parse: {e}")))?; - let mut map = HashMap::new(); - for jwk in &jwks.keys { - let kid = match &jwk.common.key_id { - Some(k) => k.clone(), - None => continue, - }; - if let Ok(dk) = DecodingKey::from_jwk(jwk) { - map.insert(kid, dk); - } - } - *self.keys.write().await = map; - Ok(()) - } -} - 
-#[cfg(test)] -impl Verifier { - /// Test-only: pre-seed the JWKS cache with a single (kid, key) - /// pair and skip the GitHub fetch. Lets unit tests round-trip a - /// synthetic JWT through the same `verify_allowing` path the - /// production code uses. - pub fn for_test( - owner: Principal, - audience: String, - kid: String, - key: DecodingKey, - ) -> Arc { - let mut keys = HashMap::new(); - keys.insert(kid, key); - Arc::new(Self { - owner, - audience, - http: Client::new(), - keys: RwLock::new(keys), - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use jsonwebtoken::{EncodingKey, Header}; - use serde_json::json; - - // Static test RSA-2048 keypair generated once with - // openssl genrsa 2048 | openssl pkcs8 -topk8 -nocrypt - // and the matching public-key SPKI form. No security implication — - // used only inside this test module to round-trip JWTs through - // the verifier without hitting the live GitHub JWKS endpoint. - const TEST_PRIV_PEM: &str = "-----BEGIN PRIVATE KEY----- -MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQDHX2vIc1wz+QRm -bB+Sus1FVeam4/XUJ+IiOFDvAQADa4rK/o16DdQUqSzdqDmML4HjTfj7Kz5z4kC5 -+uAMvijEtBYcnURN+RC70GtvIRU3qoQLzt1QiNE/Hwm4yB+oh9DTcEdTZbwT19jd -FfQCsz9gmqujkIdnbQTv0bg+xqTts/hQLDL7ki8hrRq+mO17HLxTUWVQm0HEi9S9 -Bdm1vpTWazM2FN92j1xy7vhb05c0wNILIvix+aMkoencTId3q7lItFe2VBlur3RY -TOirBmFNlNzpMDU7c8BbSzEEKweqWpiKUzQmzUSTbRZdgsuJnFtmQ7KNqTuzseOK -IT8HQk8XAgMBAAECggEAMqt9qSQoesz+4Uj5fUEcilKanC+zeofoYOoPJ68JYdUj -IRQwwKRjEh0s2ei3N3mbeTmH3c3PwYPvD1VDO/nYQqXCOON/SJHUPudpZoTx74PW -q2mXtbAP/grVXbD+2sYpvJL8jaV9d02UQBwkN8t2gAbPOHKy9wYuCwUx3kJ+CCsR -zjsABQto07Pbcg0t7XUpPyQ7zYwUuGbPzQP0Hm5NvR/PK7WGxOXLZYb2EPXzU5kw -/oUIFIHjoI3njoAxoDACJ4r/OpC6vt2lct0ffQySbfFpaFJmSp5QLIXs+CiZfwgV -+XnbeuykuSq5SRAbPbveyMWXZDAqa1bIDponLlaagQKBgQD3uPnNm5wiLJH++5/D -mOH+d9gkFk/7CPtgYjT41r1bKmzRkglUNk7xsOFmAJpkaP3VS1KI2iKXbL7MMA7F -VTt/cX+t/fnTatkGOdgaVP//CP+qlAVHXmM/rKSsWmfHq4qtcm5Tfb09jJhnD2b7 -l/bWN7J9UHAXfDdqc6E5McwSqwKBgQDOCNwbG2uhwEjct0hZbZ39POVKTJ3epAZe 
-aMaITeBUywRXIp7PdK9SPPm4Wg4Y0+4yu4p+so2jKnYMwoAnclYfaVMNeiLcRtgV -06hpmXeDjRHDhS+fdjNBzoamIykIyuY4SZJrSMi3gY1Uv9L0X3ZbjO6ncL/s9FMS -8w3UByvVRQKBgHjJTawuKrQTDWDJqf3CRrdAEjiOVJMvrvoxCGkos42HIyYQUdIo -5Nc+CrkkpCM/ej0NDAJEckdpM6L1783SID+kxL++rZijaYx6md9FAMmGxrqSj/xb -joMWl/id4Cpgfy7RM/AryCEBs7HUtb8JOsb6w2IM3Yrl+1NBbCQqHrofAoGAF767 -p0AcwnKHszBIXU4d1C6tekekNiGPPlgy8UiQXxVatbQeu2gGQKMYYJ+4WjIqlJw6 -lOl9G13sZwIPhPxPYqVf1gDKfbqIctOG6Eywkm+yqWbzGxyjQaVMrV8F/qZrq/cG -seicgVGj+S12YYWS/XAbnR6IcAWkgV2TrWj6K0UCgYEArY8dMRksLbzwE8eHT2rQ -95ERpsq0oc84O+GDW/1n0AHiEDNwKn+WPwrLOgss4nR7P7WxdAHGvyc5CP95XWx4 -AzloIKJp4ZseC5ai6mcyQLbG6cWPvGAfJdbjWXnyqlhwtl5nWObQ+OKfehuOU8Iy -JQyuinyZVi+WSfcc0EaVlOU= ------END PRIVATE KEY----- -"; - const TEST_PUB_PEM: &str = "-----BEGIN PUBLIC KEY----- -MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAx19ryHNcM/kEZmwfkrrN -RVXmpuP11CfiIjhQ7wEAA2uKyv6Neg3UFKks3ag5jC+B4034+ys+c+JAufrgDL4o -xLQWHJ1ETfkQu9BrbyEVN6qEC87dUIjRPx8JuMgfqIfQ03BHU2W8E9fY3RX0ArM/ -YJqro5CHZ20E79G4Psak7bP4UCwy+5IvIa0avpjtexy8U1FlUJtBxIvUvQXZtb6U -1mszNhTfdo9ccu74W9OXNMDSCyL4sfmjJKHp3EyHd6u5SLRXtlQZbq90WEzoqwZh -TZTc6TA1O3PAW0sxBCsHqlqYilM0Js1Ek20WXYLLiZxbZkOyjak7s7HjiiE/B0JP -FwIDAQAB ------END PUBLIC KEY----- -"; - const TEST_KID: &str = "test-kid"; - - fn enc_key() -> EncodingKey { - EncodingKey::from_rsa_pem(TEST_PRIV_PEM.as_bytes()).expect("test priv pem") - } - - fn dec_key() -> DecodingKey { - DecodingKey::from_rsa_pem(TEST_PUB_PEM.as_bytes()).expect("test pub pem") - } - - fn now() -> i64 { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .as_secs() as i64 - } - - /// Construct a synthetic Actions OIDC JWT signed with the test key. - /// `extra` is merged into the payload after the standard claims so - /// individual tests can override or add claims. None values delete. 
- fn mint(extra: serde_json::Value) -> String { - let mut header = Header::new(Algorithm::RS256); - header.kid = Some(TEST_KID.into()); - let mut payload = json!({ - "iss": ISSUER, - "aud": "dd-agent", - "exp": now() + 600, - "iat": now(), - "sub": "repo:posix4e/dd-hyperliquid-recorder-example:ref:refs/heads/main", - }); - if let (Some(p), Some(e)) = (payload.as_object_mut(), extra.as_object()) { - for (k, v) in e { - if v.is_null() { - p.remove(k); - } else { - p.insert(k.clone(), v.clone()); - } - } - } - jsonwebtoken::encode(&header, &payload, &enc_key()).expect("sign") - } - - fn verifier(owner: Principal) -> Arc { - Verifier::for_test(owner, "dd-agent".into(), TEST_KID.into(), dec_key()) - } - - fn org() -> Principal { - Principal::from_parts("devopsdefender".into(), 67890123, PrincipalKind::Org).unwrap() - } - - fn user() -> Principal { - Principal::from_parts("posix4e".into(), 12345678, PrincipalKind::User).unwrap() - } - - fn repo() -> Principal { - Principal::from_parts( - "posix4e/dd-hyperliquid-recorder-example".into(), - 884121234, - PrincipalKind::Repo, - ) - .unwrap() - } - - // ── alg-rejection (preserved from before the refactor) ───────── - - #[tokio::test] - async fn reject_alg_none() { - let token = "eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.e30."; - let v = verifier(org()); - let err = v.verify(token).await.unwrap_err(); - assert!(matches!(err, Error::BadRequest(_))); - } - - #[tokio::test] - async fn reject_hs256() { - let token = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.e30.sig"; - let v = verifier(org()); - let err = v.verify(token).await.unwrap_err(); - match err { - Error::BadRequest(m) => assert!(m.contains("alg")), - e => panic!("expected BadRequest, got {e:?}"), - } - } - - #[tokio::test] - async fn verify_allowing_rejects_bad_alg() { - let token = "eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.e30."; - let v = verifier(org()); - let extra = user(); - let err = v.verify_allowing(token, Some(&extra)).await.unwrap_err(); - assert!(matches!(err, 
Error::BadRequest(_))); - } - - // ── from_parts shape consistency ────────────────────────────── - - #[test] - fn from_parts_rejects_repo_without_slash() { - let err = Principal::from_parts("nopath".into(), 1, PrincipalKind::Repo).unwrap_err(); - assert!(matches!(err, Error::Internal(_))); - } - - #[test] - fn from_parts_rejects_user_with_slash() { - let err = Principal::from_parts("a/b".into(), 1, PrincipalKind::User).unwrap_err(); - assert!(matches!(err, Error::Internal(_))); - } - - #[test] - fn from_parts_rejects_org_with_slash() { - let err = Principal::from_parts("a/b".into(), 1, PrincipalKind::Org).unwrap_err(); - assert!(matches!(err, Error::Internal(_))); - } - - #[test] - fn from_parts_rejects_zero_id() { - let err = Principal::from_parts("name".into(), 0, PrincipalKind::User).unwrap_err(); - assert!(matches!(err, Error::Internal(_))); - } - - #[test] - fn from_parts_rejects_empty_name() { - let err = Principal::from_parts("".into(), 1, PrincipalKind::User).unwrap_err(); - assert!(matches!(err, Error::Internal(_))); - } - - // ── positive paths ──────────────────────────────────────────── - - #[tokio::test] - async fn org_kind_accepts_matching_token() { - let token = mint(json!({ - "repository": "devopsdefender/anything", - "repository_id": 999_001u64, - "repository_owner": "devopsdefender", - "repository_owner_id": 67890123u64, - })); - let claims = verifier(org()).verify(&token).await.unwrap(); - assert_eq!(claims.repository_owner, "devopsdefender"); - } - - #[tokio::test] - async fn user_kind_accepts_matching_token() { - let token = mint(json!({ - "repository": "posix4e/dd-hyperliquid-recorder-example", - "repository_id": 884121234u64, - "repository_owner": "posix4e", - "repository_owner_id": 12345678u64, - })); - let claims = verifier(user()).verify(&token).await.unwrap(); - assert_eq!(claims.repository_owner, "posix4e"); - } - - #[tokio::test] - async fn repo_kind_accepts_matching_token() { - let token = mint(json!({ - "repository": 
"posix4e/dd-hyperliquid-recorder-example", - "repository_id": 884121234u64, - "repository_owner": "posix4e", - "repository_owner_id": 12345678u64, - })); - let claims = verifier(repo()).verify(&token).await.unwrap(); - assert_eq!(claims.repository, "posix4e/dd-hyperliquid-recorder-example"); - } - - // ── squat-defense / shape-mismatch rejections ───────────────── - - #[tokio::test] - async fn org_kind_rejects_wrong_owner_id() { - // Right name, wrong ID — exactly the squat-attack shape. - let token = mint(json!({ - "repository_owner": "devopsdefender", - "repository_owner_id": 99999999u64, - })); - let err = verifier(org()).verify(&token).await.unwrap_err(); - assert!(matches!(err, Error::Unauthorized)); - } - - #[tokio::test] - async fn org_kind_rejects_missing_owner_id() { - let token = mint(json!({ - "repository_owner": "devopsdefender", - "repository_owner_id": null, - })); - let err = verifier(org()).verify(&token).await.unwrap_err(); - assert!(matches!(err, Error::Unauthorized)); - } - - #[tokio::test] - async fn org_kind_rejects_wrong_owner_name() { - let token = mint(json!({ - "repository_owner": "someone-else", - "repository_owner_id": 67890123u64, - })); - let err = verifier(org()).verify(&token).await.unwrap_err(); - assert!(matches!(err, Error::Unauthorized)); - } - - #[tokio::test] - async fn repo_kind_rejects_owner_only_token() { - // Verifier wants repository_id; token has a matching owner - // but a different repository_id (= a different repo under - // the same owner). Strictness is the point. 
- let token = mint(json!({ - "repository": "posix4e/some-other-repo", - "repository_id": 222222u64, - "repository_owner": "posix4e", - "repository_owner_id": 12345678u64, - })); - let err = verifier(repo()).verify(&token).await.unwrap_err(); - assert!(matches!(err, Error::Unauthorized)); - } - - // ── extra principal ─────────────────────────────────────────── - - #[tokio::test] - async fn extra_principal_accepts_when_fleet_rejects() { - let token = mint(json!({ - "repository_owner": "posix4e", - "repository_owner_id": 12345678u64, - })); - // Fleet is an org; token is a user — but extra=user matches. - let v = verifier(org()); - let claims = v.verify_allowing(&token, Some(&user())).await.unwrap(); - assert_eq!(claims.repository_owner, "posix4e"); - } - - #[tokio::test] - async fn extra_principal_does_not_relax_squat_defense() { - let token = mint(json!({ - "repository_owner": "posix4e", - "repository_owner_id": 99999999u64, - })); - let v = verifier(org()); - let err = v.verify_allowing(&token, Some(&user())).await.unwrap_err(); - assert!(matches!(err, Error::Unauthorized)); - } -} diff --git a/src/html.rs b/src/html.rs deleted file mode 100644 index d5e1aee..0000000 --- a/src/html.rs +++ /dev/null @@ -1,73 +0,0 @@ -//! Shared page shell + CSS + nav for all rendered HTML. 
- -pub const CSS: &str = r#" -* { box-sizing:border-box; margin:0; padding:0; } -body { background:#1e1e2e; color:#cdd6f4; font-family:'JetBrains Mono',ui-monospace,monospace; } -a { color:#89b4fa; text-decoration:none; } a:hover { text-decoration:underline; } -nav { display:flex; align-items:center; gap:16px; padding:12px 24px; border-bottom:1px solid #313244; } -nav .brand { color:#89b4fa; font-weight:700; font-size:14px; } -nav a { color:#a6adc8; font-size:13px; } nav a:hover, nav a.active { color:#cdd6f4; } -nav .spacer { flex:1; } -main { max-width:1080px; margin:0 auto; padding:24px; } -h1 { color:#89b4fa; font-size:20px; margin-bottom:4px; } -.sub { color:#585b70; font-size:12px; margin-bottom:16px; } -.meta { color:#a6adc8; font-size:13px; margin-bottom:24px; } -.meta .ok { color:#a6e3a1; } -.section { color:#a6adc8; font-size:12px; text-transform:uppercase; margin:20px 0 8px; } -.cards { display:grid; grid-template-columns:repeat(auto-fit,minmax(180px,1fr)); gap:12px; margin-bottom:16px; } -.card { background:#181825; border:1px solid #313244; border-radius:8px; padding:16px; } -.card .label { color:#a6adc8; font-size:11px; text-transform:uppercase; } -.card .value { font-size:20px; margin-top:4px; } -.card .value.green { color:#a6e3a1; } .card .value.blue { color:#89b4fa; } -.card .value.peach { color:#fab387; } .card .value.mauve { color:#cba6f7; } -.row { display:flex; justify-content:space-between; padding:8px 0; border-bottom:1px solid #313244; } -.row:last-child { border-bottom:none; } -table { border-collapse:collapse; width:100%; } -th { text-align:left; color:#a6adc8; font-weight:normal; font-size:12px; text-transform:uppercase; padding:8px 12px; border-bottom:1px solid #313244; } -td { padding:8px 12px; border-bottom:1px solid #313244; font-size:14px; } -.pill { display:inline-block; padding:2px 8px; border-radius:4px; font-size:12px; font-weight:600; } -.pill.healthy, .pill.running { background:#a6e3a122; color:#a6e3a1; } -.pill.stale, 
.pill.deploying { background:#fab38722; color:#fab387; } -.pill.dead, .pill.failed, .pill.exited { background:#f38ba822; color:#f38ba8; } -.pill.idle { background:#31324488; color:#a6adc8; } -input, button { font-family:inherit; font-size:14px; } -input[type=password], input[type=text] { width:100%; padding:10px 12px; background:#11111b; border:1px solid #313244; border-radius:6px; color:#cdd6f4; outline:none; } -input:focus { border-color:#89b4fa; } -button { padding:10px 16px; background:#89b4fa; color:#1e1e2e; border:none; border-radius:6px; font-weight:600; cursor:pointer; } -button:hover { background:#74c7ec; } -.empty { color:#585b70; padding:24px; text-align:center; } -.dim { color:#585b70; } -.back { font-size:13px; margin-bottom:20px; } -.err { color:#f38ba8; font-size:13px; margin-bottom:12px; } -pre { background:#11111b; border:1px solid #313244; border-radius:8px; padding:16px; overflow:auto; font-size:12px; line-height:1.5; color:#a6adc8; } -code { background:#11111b; padding:2px 6px; border-radius:3px; font-size:12px; } -@media (max-width:640px) { main { padding:16px; } .cards { grid-template-columns:1fr 1fr; } } -"#; - -pub fn shell(title: &str, nav: &str, body: &str) -> String { - format!( - r#" - -{title}{nav}
{body}
"# - ) -} - -pub fn nav(items: &[(&str, &str, bool)]) -> String { - let mut s = String::from(r#""#); - s -} - -pub fn escape(s: &str) -> String { - s.replace('&', "&") - .replace('<', "<") - .replace('>', ">") - .replace('"', """) -} diff --git a/src/ita.rs b/src/ita.rs deleted file mode 100644 index aa4e430..0000000 --- a/src/ita.rs +++ /dev/null @@ -1,253 +0,0 @@ -//! Intel Trust Authority (ITA) — mint + verify. -//! -//! The agent fetches a raw TDX quote from easyenclave, POSTs it to -//! `api.trustauthority.intel.com/appraisal/v1/attest` with an x-api-key -//! header, and receives a signed JWT ("ITA token"). That JWT is -//! forwarded in the register payload. The CP verifies the signature -//! against Intel's JWKS, checks issuer + exp + algorithm allowlist, -//! and stores the decoded claims on the agent record. -//! -//! Fail open: if an agent has no `DD_ITA_API_KEY` it registers without -//! a token. The CP accepts unsigned registrations unless `DD_ITA_REQUIRED=true`. - -use std::collections::HashMap; -use std::sync::Arc; - -use jsonwebtoken::{Algorithm, DecodingKey, Validation}; -use reqwest::Client; -use serde::{Deserialize, Serialize}; -use tokio::sync::RwLock; - -use crate::error::{Error, Result}; - -/// Algorithms we accept on ITA-issued tokens. Explicitly excludes -/// `HS*` (symmetric — can't verify against a JWKS) and `none`. 
-const ALLOWED_ALGS: &[Algorithm] = &[ - Algorithm::RS256, - Algorithm::RS384, - Algorithm::RS512, - Algorithm::ES256, - Algorithm::ES384, - Algorithm::PS256, - Algorithm::PS384, - Algorithm::PS512, - Algorithm::EdDSA, -]; - -const LEEWAY_SECS: u64 = 120; - -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub struct Claims { - pub exp: i64, - pub iat: i64, - #[serde(default)] - pub tcb_status: Option, - #[serde(default)] - pub attester_type: Option, - #[serde(default)] - pub mrtd: Option, - #[serde(default)] - pub mrsigner: Option, - #[serde(default)] - pub report_data: Option, - /// Full body, preserved so the dashboard can show anything the - /// typed fields above don't cover. - #[serde(default)] - pub extra: serde_json::Value, -} - -impl Claims { - fn from_value(v: serde_json::Value) -> Self { - // Map Intel's wire names to our normalized ones. - let get = |k: &str| v.get(k).and_then(|x| x.as_str()).map(String::from); - Self { - exp: v.get("exp").and_then(|x| x.as_i64()).unwrap_or(0), - iat: v.get("iat").and_then(|x| x.as_i64()).unwrap_or(0), - tcb_status: get("attester_tcb_status"), - attester_type: get("attester_type"), - mrtd: get("tdx_mrtd"), - mrsigner: get("tdx_mrsigner"), - report_data: get("attester_held_data"), - extra: v, - } - } -} - -// ── Minter ────────────────────────────────────────────────────────────── - -#[derive(Serialize)] -struct MintRequest<'a> { - quote: &'a str, -} - -#[derive(Deserialize)] -struct MintResponse { - token: String, -} - -/// POST a base64 TDX quote to Intel, receive a signed JWT. `base_url` -/// is typically `https://api.trustauthority.intel.com`. 
-pub async fn mint(base_url: &str, api_key: &str, quote_b64: &str) -> Result { - let url = format!("{}/appraisal/v1/attest", base_url.trim_end_matches('/')); - let resp = Client::new() - .post(&url) - .header("x-api-key", api_key) - .header("Accept", "application/json") - .json(&MintRequest { quote: quote_b64 }) - .send() - .await - .map_err(|e| Error::Upstream(format!("ITA mint {url}: {e}")))?; - let status = resp.status(); - if !status.is_success() { - let body = resp.text().await.unwrap_or_default(); - return Err(Error::Upstream(format!("ITA mint {status}: {body}"))); - } - let body: MintResponse = resp.json().await?; - Ok(body.token) -} - -// ── Verifier ──────────────────────────────────────────────────────────── - -/// Caches the JWKS in memory. Refreshes on unknown `kid`; otherwise -/// serves from cache indefinitely (Intel rotates rarely). -pub struct Verifier { - jwks_url: String, - issuer: String, - http: Client, - keys: RwLock>, -} - -impl Verifier { - pub fn new(jwks_url: String, issuer: String) -> Arc { - Arc::new(Self { - jwks_url, - issuer, - http: Client::new(), - keys: RwLock::new(HashMap::new()), - }) - } - - /// Verify a JWT. Returns decoded claims on success. - pub async fn verify(&self, token: &str) -> Result { - let header = jsonwebtoken::decode_header(token) - .map_err(|e| Error::BadRequest(format!("ita header: {e}")))?; - if !ALLOWED_ALGS.contains(&header.alg) { - return Err(Error::BadRequest(format!( - "ita alg {:?} not allowed", - header.alg - ))); - } - let kid = header - .kid - .ok_or_else(|| Error::BadRequest("ita token missing kid".into()))?; - - let key = match self.lookup(&kid).await { - Some(k) => k, - None => { - self.refresh().await?; - self.lookup(&kid) - .await - .ok_or_else(|| Error::BadRequest(format!("ita kid {kid} not in JWKS")))? 
- } - }; - - let mut v = Validation::new(header.alg); - v.set_issuer(&[&self.issuer]); - v.leeway = LEEWAY_SECS; - v.set_required_spec_claims(&["exp", "iat", "iss"]); - - let data = jsonwebtoken::decode::(token, &key, &v) - .map_err(|e| Error::BadRequest(format!("ita verify: {e}")))?; - Ok(Claims::from_value(data.claims)) - } - - async fn lookup(&self, kid: &str) -> Option { - // DecodingKey isn't Clone, so we can't return a reference to a - // cached entry outliving the lock. Re-derive from the cached - // JWK instead — we store the raw JWK bytes and reconstruct. - self.keys.read().await.get(kid).cloned() - } - - async fn refresh(&self) -> Result<()> { - let resp = self - .http - .get(&self.jwks_url) - .send() - .await - .map_err(|e| Error::Upstream(format!("JWKS fetch {}: {e}", self.jwks_url)))?; - if !resp.status().is_success() { - return Err(Error::Upstream(format!( - "JWKS {}: HTTP {}", - self.jwks_url, - resp.status() - ))); - } - let jwks: jsonwebtoken::jwk::JwkSet = resp - .json() - .await - .map_err(|e| Error::Upstream(format!("JWKS parse: {e}")))?; - - let mut map = HashMap::new(); - for jwk in &jwks.keys { - let kid = match &jwk.common.key_id { - Some(k) => k.clone(), - None => continue, - }; - if let Ok(dk) = DecodingKey::from_jwk(jwk) { - map.insert(kid, dk); - } - } - *self.keys.write().await = map; - Ok(()) - } -} - -// ── Convenience for DecodingKey cloning ───────────────────────────────── -// jsonwebtoken::DecodingKey is Clone in 9.x, so the HashMap -// above works directly. (Left as a comment so we notice if that changes.) - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn reject_algorithm_none() { - // Header {"alg":"none"}, empty sig. Base64url("{\"alg\":\"none\",\"typ\":\"JWT\"}") - // . Base64url("{}") . "". 
- let token = "eyJhbGciOiJub25lIiwidHlwIjoiSldUIn0.e30."; - let v = Verifier::new("http://127.0.0.1:1/".into(), "x".into()); - let err = v.verify(token).await.unwrap_err(); - assert!(matches!(err, Error::BadRequest(_)), "got {err:?}"); - } - - #[tokio::test] - async fn reject_hs256() { - // alg=HS256 — not in our allowlist, must fail on alg check alone. - let token = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.e30.signature_placeholder"; - let v = Verifier::new("http://127.0.0.1:1/".into(), "x".into()); - let err = v.verify(token).await.unwrap_err(); - match err { - Error::BadRequest(m) => assert!(m.contains("alg"), "msg: {m}"), - e => panic!("expected BadRequest, got {e:?}"), - } - } - - #[test] - fn claims_map_intel_wire_names() { - let v = serde_json::json!({ - "exp": 123, "iat": 1, - "attester_tcb_status": "UpToDate", - "attester_type": "TDX", - "tdx_mrtd": "aa", - "tdx_mrsigner": "bb", - "attester_held_data": "cc", - }); - let c = Claims::from_value(v.clone()); - assert_eq!(c.exp, 123); - assert_eq!(c.tcb_status.as_deref(), Some("UpToDate")); - assert_eq!(c.mrtd.as_deref(), Some("aa")); - assert_eq!(c.mrsigner.as_deref(), Some("bb")); - assert_eq!(c.report_data.as_deref(), Some("cc")); - assert_eq!(c.extra, v); - } -} diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index e40598e..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,15 +0,0 @@ -pub mod agent; -pub mod cf; -pub mod collector; -pub mod config; -pub mod cp; -pub mod devices; -pub mod ee; -pub mod error; -pub mod gh_oidc; -pub mod html; -pub mod ita; -pub mod metrics; -pub mod noise_gateway; -pub mod stonith; -pub mod taint; diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 6618074..0000000 --- a/src/main.rs +++ /dev/null @@ -1,30 +0,0 @@ -//! devopsdefender — unified binary. -//! -//! DD_MODE=cp devopsdefender # control-plane -//! DD_MODE=agent devopsdefender # in-VM agent -//! -//! (Also accepts `devopsdefender cp` / `devopsdefender agent` for local dev.) 
- -use devopsdefender::{agent, cp}; - -#[tokio::main] -async fn main() { - let mode = std::env::var("DD_MODE") - .ok() - .or_else(|| std::env::args().nth(1).filter(|s| !s.starts_with('-'))); - - let result = match mode.as_deref() { - Some("cp") | Some("management") => cp::run().await, - Some("agent") => agent::run().await, - _ => { - eprintln!("usage: devopsdefender "); - eprintln!(" or: DD_MODE= devopsdefender"); - std::process::exit(2); - } - }; - - if let Err(e) = result { - eprintln!("devopsdefender: fatal: {e}"); - std::process::exit(1); - } -} diff --git a/src/metrics.rs b/src/metrics.rs deleted file mode 100644 index ad1b7a8..0000000 --- a/src/metrics.rs +++ /dev/null @@ -1,206 +0,0 @@ -//! System metrics for the dashboard and `/health` JSON. Most of the -//! work is done by the `sysinfo` crate — we just project its data -//! into our stable wire shape. CPU utilization still comes from -//! `/proc/stat` since `sysinfo` requires a 200 ms sample to report -//! non-zero CPU, and we don't want that delay on every request. - -use serde::{Deserialize, Serialize}; -use sysinfo::{Disks, Networks, System}; - -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub struct DiskStats { - pub mount: String, - pub fstype: String, - pub used_bytes: u64, - pub total_bytes: u64, -} - -#[derive(Debug, Clone, Serialize, Deserialize, Default)] -pub struct NetStats { - pub iface: String, - pub rx_bytes: u64, - pub tx_bytes: u64, -} - -#[derive(Debug, Clone, Serialize, Default)] -pub struct SysMetrics { - pub cpu_pct: u64, - pub mem_used_mb: u64, - pub mem_total_mb: u64, - /// Swap used / total (0 if the VM has no swap). - pub swap_used_mb: u64, - pub swap_total_mb: u64, - pub load_1m: f64, - pub load_5m: f64, - pub load_15m: f64, - /// System uptime in seconds (how long the VM has been booted). - pub uptime_secs: u64, - /// Per-interface RX/TX byte counters (excludes `lo`). - pub nets: Vec, - /// Per-mount capacity stats (excludes pseudo-filesystems). 
- pub disks: Vec, -} - -pub async fn collect() -> SysMetrics { - let cpu_pct = tokio::fs::read_to_string("/proc/stat") - .await - .ok() - .and_then(|s| cpu_pct_from_stat(&s)) - .unwrap_or(0); - - // sysinfo's API is sync and does blocking I/O; hop off the - // reactor thread so long /proc walks don't stall the server. - tokio::task::spawn_blocking(move || { - let mut sys = System::new(); - sys.refresh_memory(); - - let load = System::load_average(); - let uptime_secs = System::uptime(); - - let mem_total_mb = sys.total_memory() / 1024 / 1024; - let mem_used_mb = sys.used_memory() / 1024 / 1024; - let swap_total_mb = sys.total_swap() / 1024 / 1024; - let swap_used_mb = sys.used_swap() / 1024 / 1024; - - let nets = Networks::new_with_refreshed_list() - .iter() - .filter(|(name, _)| *name != "lo") - .map(|(name, data)| NetStats { - iface: name.to_string(), - rx_bytes: data.total_received(), - tx_bytes: data.total_transmitted(), - }) - .collect(); - - let mut seen = std::collections::HashSet::new(); - let disks = Disks::new_with_refreshed_list() - .iter() - .filter_map(|d| { - let mount = d.mount_point().to_string_lossy().into_owned(); - if !seen.insert(mount.clone()) { - return None; - } - let total = d.total_space(); - if total == 0 { - return None; - } - Some(DiskStats { - mount, - fstype: d.file_system().to_string_lossy().into_owned(), - total_bytes: total, - used_bytes: total.saturating_sub(d.available_space()), - }) - }) - .collect(); - - SysMetrics { - cpu_pct, - mem_total_mb, - mem_used_mb, - swap_total_mb, - swap_used_mb, - load_1m: load.one, - load_5m: load.five, - load_15m: load.fifteen, - uptime_secs, - nets, - disks, - } - }) - .await - .unwrap_or_default() -} - -/// One-shot CPU utilization from `/proc/stat`'s aggregate counters: -/// `(total - idle) / total` over the lifetime of the kernel. 
Coarse -/// (it's an average, not an instantaneous reading) but doesn't -/// require a two-sample delta like sysinfo's CPU, and matches the -/// historical shape the dashboard has been rendering. -fn cpu_pct_from_stat(stat: &str) -> Option { - let line = stat.lines().next()?; - let vals: Vec = line - .split_whitespace() - .skip(1) - .filter_map(|v| v.parse().ok()) - .collect(); - if vals.len() < 4 { - return None; - } - let total: u64 = vals.iter().sum(); - let idle = vals[3]; - let idle_pct = (idle.saturating_mul(100)).checked_div(total)?; - Some(100u64.saturating_sub(idle_pct)) -} - -pub fn format_bytes_mb(mb: u64) -> String { - if mb >= 1024 { - format!("{:.1}G", mb as f64 / 1024.0) - } else { - format!("{mb}M") - } -} - -/// Humanise a raw byte count as K/M/G/T. Used for network counters. -pub fn format_bytes_si(b: u64) -> String { - const K: u64 = 1024; - const M: u64 = 1024 * K; - const G: u64 = 1024 * M; - const T: u64 = 1024 * G; - if b >= T { - format!("{:.1}T", b as f64 / T as f64) - } else if b >= G { - format!("{:.1}G", b as f64 / G as f64) - } else if b >= M { - format!("{:.1}M", b as f64 / M as f64) - } else if b >= K { - format!("{:.1}K", b as f64 / K as f64) - } else { - format!("{b}B") - } -} - -pub fn format_duration_secs(s: u64) -> String { - if s >= 86400 { - format!("{}d {}h", s / 86400, (s % 86400) / 3600) - } else if s >= 3600 { - format!("{}h {}m", s / 3600, (s % 3600) / 60) - } else if s >= 60 { - format!("{}m", s / 60) - } else { - format!("{s}s") - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn cpu_pct_computes_from_stat() { - // user=100 nice=0 system=50 idle=850 … → (150/1000)=15% used. 
- let stat = "cpu 100 0 50 850 0 0 0 0 0 0\ncpu0 50 0 25 425 0 0 0 0 0 0"; - assert_eq!(cpu_pct_from_stat(stat), Some(15)); - } - - #[test] - fn cpu_pct_handles_zero_total() { - assert_eq!(cpu_pct_from_stat("cpu 0 0 0 0"), None); - } - - #[test] - fn format_bytes_si_boundaries() { - assert_eq!(format_bytes_si(0), "0B"); - assert_eq!(format_bytes_si(1023), "1023B"); - assert_eq!(format_bytes_si(1024), "1.0K"); - assert_eq!(format_bytes_si(1024 * 1024), "1.0M"); - assert_eq!(format_bytes_si(1024u64.pow(3)), "1.0G"); - } - - #[test] - fn format_duration_shapes() { - assert_eq!(format_duration_secs(45), "45s"); - assert_eq!(format_duration_secs(3 * 60), "3m"); - assert_eq!(format_duration_secs(2 * 3600 + 30 * 60), "2h 30m"); - assert_eq!(format_duration_secs(3 * 86400 + 5 * 3600), "3d 5h"); - } -} diff --git a/src/noise_gateway/allowlist.rs b/src/noise_gateway/allowlist.rs deleted file mode 100644 index 816607b..0000000 --- a/src/noise_gateway/allowlist.rs +++ /dev/null @@ -1,90 +0,0 @@ -//! Method allowlist for EE agent-socket RPCs. -//! -//! Only methods that are safe to expose to an external device are let -//! through. `deploy` in particular stays internal — enclave workload -//! topology is managed by DD's CP + agent, not by end-user CLIs. - -use serde::{Deserialize, Serialize}; - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum Method { - Attest, - Attach, - Exec, - Health, - List, - Logs, -} - -impl Method { - pub fn as_str(self) -> &'static str { - match self { - Self::Attest => "attest", - Self::Attach => "attach", - Self::Exec => "exec", - Self::Health => "health", - Self::List => "list", - Self::Logs => "logs", - } - } -} - -/// Deserialize just the `method` field from a request envelope and -/// match it against the allowlist. Returns `Err` for unknown or -/// disallowed methods. 
-pub fn classify(raw: &serde_json::Value) -> Result { - let method = raw - .get("method") - .and_then(|v| v.as_str()) - .ok_or(ClassifyError::Missing)?; - match method { - "attest" => Ok(Method::Attest), - "attach" => Ok(Method::Attach), - "exec" => Ok(Method::Exec), - "health" => Ok(Method::Health), - "list" => Ok(Method::List), - "logs" => Ok(Method::Logs), - "deploy" => Err(ClassifyError::Disallowed("deploy".into())), - other => Err(ClassifyError::Unknown(other.into())), - } -} - -#[derive(Debug, thiserror::Error)] -pub enum ClassifyError { - #[error("request envelope missing `method` field")] - Missing, - #[error("method `{0}` is not in the allowlist")] - Disallowed(String), - #[error("unknown method `{0}`")] - Unknown(String), -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn deploy_blocked() { - let r = classify(&serde_json::json!({"method": "deploy"})); - assert!(matches!(r, Err(ClassifyError::Disallowed(_)))); - } - - #[test] - fn exec_allowed() { - let r = classify(&serde_json::json!({"method": "exec", "argv": ["ls"]})); - assert_eq!(r.unwrap(), Method::Exec); - } - - #[test] - fn unknown_rejected() { - let r = classify(&serde_json::json!({"method": "steal"})); - assert!(matches!(r, Err(ClassifyError::Unknown(_)))); - } - - #[test] - fn missing_rejected() { - let r = classify(&serde_json::json!({})); - assert!(matches!(r, Err(ClassifyError::Missing))); - } -} diff --git a/src/noise_gateway/attest.rs b/src/noise_gateway/attest.rs deleted file mode 100644 index 78956da..0000000 --- a/src/noise_gateway/attest.rs +++ /dev/null @@ -1,159 +0,0 @@ -//! TDX attestation + Noise static keypair. -//! -//! On boot we either load an existing 32-byte X25519 private key from -//! disk (tmpfs — `/run/devopsdefender/noise.key`) or mint a fresh one. -//! The corresponding public key is embedded in a TDX quote's -//! `report_data` field (low 32 bytes; high 32 bytes zero). The quote + -//! pubkey bundle is surfaced by the containing service's `/health` -//! 
endpoint (see `agent::health` and `cp::health`); clients verify -//! the quote via ITA, extract the Noise static pubkey from -//! `report_data`, and trust that key for the handshake. No X.509 -//! certs in the loop. -//! -//! There used to be a dedicated `GET /attest` route here. It was -//! collapsed into `/health` so a bastion-app bootstrap does one -//! request instead of two, and so the CF Access bypass list shrinks -//! by one app per env × per service. The Noise quote is stable per -//! boot — just an `Arc` clone on each `/health` hit. - -use std::path::{Path, PathBuf}; - -use rand::rngs::OsRng; -use x25519_dalek::{PublicKey, StaticSecret}; - -pub struct Attestor { - secret: StaticSecret, - public: [u8; 32], - quote: Vec, -} - -impl Attestor { - /// Load `key_file` if it exists and is 32 bytes, otherwise mint a - /// fresh keypair and best-effort-persist it with 0600 perms. - /// Persistence is non-fatal: every deploy already rotates the - /// enclave Noise key (fresh VM / fresh boot), so losing the write - /// just means this same VM won't reuse the key across an - /// in-enclave process restart. Then generate a fresh TDX quote - /// binding the public key into `report_data`. 
- pub async fn load_or_mint(key_file: &Path) -> anyhow::Result { - let secret = match tokio::fs::read(key_file).await { - Ok(bytes) if bytes.len() == 32 => { - let mut k = [0u8; 32]; - k.copy_from_slice(&bytes); - StaticSecret::from(k) - } - _ => { - let fresh = StaticSecret::random_from_rng(OsRng); - if let Err(e) = persist_key(key_file, fresh.as_bytes()).await { - eprintln!( - "noise-gw: persist {} failed ({e}); continuing with in-memory key", - key_file.display() - ); - } - fresh - } - }; - - let public = PublicKey::from(&secret).to_bytes(); - let quote = tdx_quote(&public)?; - - Ok(Self { - secret, - public, - quote, - }) - } - - pub fn public_key(&self) -> &[u8; 32] { - &self.public - } - - pub fn secret(&self) -> &StaticSecret { - &self.secret - } - - pub fn quote(&self) -> &[u8] { - &self.quote - } -} - -async fn persist_key(key_file: &Path, bytes: &[u8; 32]) -> anyhow::Result<()> { - if let Some(parent) = key_file.parent() { - tokio::fs::create_dir_all(parent).await.ok(); - } - let tmp: PathBuf = key_file.with_extension("key.tmp"); - tokio::fs::write(&tmp, bytes).await?; - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - let perms = std::fs::Permissions::from_mode(0o600); - tokio::fs::set_permissions(&tmp, perms).await?; - } - tokio::fs::rename(&tmp, key_file).await?; - Ok(()) -} - -/// Generate a TDX quote over `report_data` derived from the given -/// X25519 pubkey. Drives `configfs-tsm` when available (Linux + TDX -/// kernel); otherwise returns a placeholder and logs a warning. The -/// graceful fallback is what lets `cargo test` and dev runs on -/// non-TDX hosts succeed — the placeholder quote will fail ITA -/// verification, which is exactly what you want off-enclave. -fn tdx_quote(pubkey: &[u8; 32]) -> anyhow::Result> { - match try_configfs_tsm_quote(pubkey) { - Ok(q) => Ok(q), - Err(e) => { - eprintln!( - "noise-gw: configfs-tsm unavailable ({e}); using placeholder quote. 
\ - Clients will fail ITA verification — this is expected off-enclave." - ); - Ok(b"noise-gw-placeholder-quote".to_vec()) - } - } -} - -#[cfg(target_os = "linux")] -fn try_configfs_tsm_quote(pubkey: &[u8; 32]) -> anyhow::Result> { - use std::fs; - use std::io::Write; - - let base = std::path::Path::new("/sys/kernel/config/tsm/report"); - if !base.exists() { - anyhow::bail!("{} not present", base.display()); - } - - let mut report_data = [0u8; 64]; - report_data[..32].copy_from_slice(pubkey); - - let dir = base.join("devopsdefender"); - fs::create_dir_all(&dir)?; - { - let mut inblob = fs::OpenOptions::new() - .write(true) - .open(dir.join("inblob"))?; - inblob.write_all(&report_data)?; - } - let outblob = fs::read(dir.join("outblob"))?; - fs::remove_dir(&dir).ok(); - Ok(outblob) -} - -#[cfg(not(target_os = "linux"))] -fn try_configfs_tsm_quote(_pubkey: &[u8; 32]) -> anyhow::Result> { - anyhow::bail!("configfs-tsm is Linux-only") -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn mint_round_trips() { - let dir = tempfile::tempdir().unwrap(); - let kf = dir.path().join("noise.key"); - let a = Attestor::load_or_mint(&kf).await.unwrap(); - let pk = *a.public_key(); - let b = Attestor::load_or_mint(&kf).await.unwrap(); - assert_eq!(&pk, b.public_key()); - } -} diff --git a/src/noise_gateway/mod.rs b/src/noise_gateway/mod.rs deleted file mode 100644 index be37377..0000000 --- a/src/noise_gateway/mod.rs +++ /dev/null @@ -1,52 +0,0 @@ -//! # Noise_IK attested gateway to EE's agent socket. -//! -//! Route exposed alongside DD's normal HTTP surface (same port): -//! - `GET /noise/ws` — WebSocket upgrade; server runs Noise_IK -//! responder. Initiator's static pubkey must be in the local -//! trust set. After the handshake every decrypted binary frame -//! is one JSON request envelope gated by [`allowlist::classify`], -//! forwarded to EE's unix agent socket with the `EE_TOKEN` -//! (when available) injected server-side. -//! -//! 
The pre-handshake TDX quote + Noise pubkey bundle is served by the -//! containing service's `/health` endpoint, not by this module — see -//! `agent::health` / `cp::health` for `{ noise: { quote_b64, -//! pubkey_hex } }`. Clients do one `/health` fetch, verify the quote -//! against ITA, pin the pubkey from `report_data`, and open this -//! WebSocket. -//! -//! This module used to live in `crates/ee-proxy/`; folded in here -//! so the trust list can be a shared in-memory set (not a file -//! contract) and so the gateway inherits whatever `EE_TOKEN` the -//! main `devopsdefender` process already has. - -pub mod allowlist; -pub mod attest; -pub mod noise; -pub mod upstream; - -use std::collections::HashSet; -use std::sync::Arc; - -use axum::Router; -use tokio::sync::RwLock; - -/// Live set of device pubkeys the local Noise responder will accept. -/// Mutated by `devices::Store` (on the CP) or by the agent's -/// `sync_trusted_devices` poll loop. -pub type TrustHandle = Arc>>; - -pub fn new_trust_handle() -> TrustHandle { - Arc::new(RwLock::new(HashSet::new())) -} - -#[derive(Clone)] -pub struct State { - pub attest: Arc, - pub trust: TrustHandle, - pub upstream: Arc, -} - -pub fn router(state: State) -> Router { - Router::new().merge(noise::routes()).with_state(state) -} diff --git a/src/noise_gateway/noise.rs b/src/noise_gateway/noise.rs deleted file mode 100644 index 2d82f2d..0000000 --- a/src/noise_gateway/noise.rs +++ /dev/null @@ -1,215 +0,0 @@ -//! Noise_IK responder over WebSocket. -//! -//! Wire: -//! 1. Client opens `GET /noise/ws` and upgrades to WebSocket. -//! 2. Client sends the first Noise_IK message (binary WS frame). -//! After reading it we inspect the initiator's static key via -//! `get_remote_static()`. If it isn't in the shared trust set -//! we close the connection. -//! 3. We respond with the second handshake message. -//! 4. Both sides move into transport mode; each subsequent WS -//! 
binary frame is one Noise transport message carrying a JSON -//! request envelope, gated by [`super::allowlist::classify`] -//! and forwarded to the EE agent socket. -//! -//! `attach` is special: after the one JSON ack frame, the session -//! shifts into a raw bidirectional byte bridge. Client→server -//! frames carry stdin bytes; server→client frames carry stdout/ -//! stderr bytes. Either side closing the WS ends the session. This -//! keeps one Noise session == one PTY, which is fine — a second -//! shell opens a second WS. - -use axum::extract::ws::{Message, WebSocket, WebSocketUpgrade}; -use axum::extract::State; -use axum::response::Response; -use axum::routing::get; -use axum::Router; -use futures_util::StreamExt; -use snow::{Builder, HandshakeState, TransportState}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; - -use super::{allowlist, State as AppState}; - -const NOISE_PATTERN: &str = "Noise_IK_25519_ChaChaPoly_BLAKE2s"; -const MAX_NOISE_MSG: usize = 65535; -/// Chunk size for raw PTY bytes flowing EE→client in attach mode. -/// Under `MAX_NOISE_MSG - 16` (auth tag) with plenty of headroom. -const ATTACH_CHUNK: usize = 4096; - -pub(crate) fn routes() -> Router { - Router::new().route("/noise/ws", get(upgrade)) -} - -async fn upgrade(ws: WebSocketUpgrade, State(state): State) -> Response { - ws.on_upgrade(move |socket| async move { - if let Err(e) = handle(socket, state).await { - eprintln!("noise-gw: session ended: {e:#}"); - } - }) -} - -async fn handle(mut socket: WebSocket, state: AppState) -> anyhow::Result<()> { - let static_private = state.attest.secret().to_bytes(); - - let mut hs: HandshakeState = Builder::new(NOISE_PATTERN.parse()?) - .local_private_key(&static_private) - .build_responder()?; - - // ── First handshake message (initiator → us) ──────────────────── - let Some(first) = next_binary(&mut socket).await? 
else { - anyhow::bail!("closed before first handshake message"); - }; - let mut payload_buf = [0u8; MAX_NOISE_MSG]; - hs.read_message(&first, &mut payload_buf)?; - - let remote_static = hs - .get_remote_static() - .ok_or_else(|| anyhow::anyhow!("Noise_IK requires a remote static key"))?; - if remote_static.len() != 32 { - anyhow::bail!("unexpected remote static length: {}", remote_static.len()); - } - let mut remote = [0u8; 32]; - remote.copy_from_slice(remote_static); - - let trusted = state.trust.read().await.contains(&remote); - if !trusted { - let _ = socket - .send(Message::Close(Some(axum::extract::ws::CloseFrame { - code: axum::extract::ws::close_code::POLICY, - reason: "unknown peer".into(), - }))) - .await; - anyhow::bail!("initiator pubkey not in trust list"); - } - - // ── Second handshake message (us → initiator) ─────────────────── - let mut second_buf = [0u8; MAX_NOISE_MSG]; - let n = hs.write_message(&[], &mut second_buf)?; - socket - .send(Message::Binary(second_buf[..n].to_vec().into())) - .await?; - - let mut transport: TransportState = hs.into_transport_mode()?; - - // ── Transport loop ────────────────────────────────────────────── - while let Some(frame) = next_binary(&mut socket).await? { - let mut plain = vec![0u8; frame.len()]; - let n = transport.read_message(&frame, &mut plain)?; - plain.truncate(n); - - let request: serde_json::Value = serde_json::from_slice(&plain) - .map_err(|e| anyhow::anyhow!("decrypted frame is not JSON: {e}"))?; - - match allowlist::classify(&request) { - Ok(allowlist::Method::Attach) => { - // Streaming path. attach_stream either hands us the EE - // socket + ack (happy) or returns an error we surface - // as a normal JSON response and keep the session in - // one-shot mode for the next request. 
- match state.upstream.attach_stream(request).await { - Ok((ack, ee_stream)) => { - send_encrypted_json(&mut transport, &mut socket, &ack).await?; - bridge_attach(&mut transport, &mut socket, ee_stream).await?; - return Ok(()); - } - Err(e) => { - let resp = serde_json::json!({ - "error": "attach_failed", - "detail": e.to_string(), - }); - send_encrypted_json(&mut transport, &mut socket, &resp).await?; - continue; - } - } - } - Ok(_method) => { - let response = state.upstream.call(request).await.unwrap_or_else(|e| { - serde_json::json!({ - "error": "upstream_failed", - "detail": e.to_string(), - }) - }); - send_encrypted_json(&mut transport, &mut socket, &response).await?; - } - Err(e) => { - let response = serde_json::json!({ - "error": "method_rejected", - "detail": e.to_string(), - }); - send_encrypted_json(&mut transport, &mut socket, &response).await?; - } - } - } - - Ok(()) -} - -async fn send_encrypted_json( - transport: &mut TransportState, - socket: &mut WebSocket, - value: &serde_json::Value, -) -> anyhow::Result<()> { - let plain = serde_json::to_vec(value)?; - send_encrypted_bytes(transport, socket, &plain).await -} - -async fn send_encrypted_bytes( - transport: &mut TransportState, - socket: &mut WebSocket, - plain: &[u8], -) -> anyhow::Result<()> { - let mut cipher = vec![0u8; plain.len() + 16]; - let n = transport.write_message(plain, &mut cipher)?; - cipher.truncate(n); - socket.send(Message::Binary(cipher.into())).await?; - Ok(()) -} - -/// Bridge WS ↔ EE socket as raw bytes for the life of one PTY. -/// Runs in the same future that owns the Noise `TransportState` so -/// we don't need a mutex around it — `select!` gives us concurrent -/// reads on both sides while serializing access to the transport. 
-async fn bridge_attach( - transport: &mut TransportState, - socket: &mut WebSocket, - ee_stream: tokio::net::UnixStream, -) -> anyhow::Result<()> { - let (mut ee_rd, mut ee_wr) = ee_stream.into_split(); - let mut ee_buf = [0u8; ATTACH_CHUNK]; - - loop { - tokio::select! { - biased; - // EE → client: raw PTY bytes, encrypted and forwarded. - n = ee_rd.read(&mut ee_buf) => { - match n? { - 0 => break, // EE closed (shell exited) - n => send_encrypted_bytes(transport, socket, &ee_buf[..n]).await?, - } - } - // Client → EE: decrypt and write stdin. - frame = next_binary(socket) => { - match frame? { - Some(cipher) => { - let mut plain = vec![0u8; cipher.len()]; - let n = transport.read_message(&cipher, &mut plain)?; - ee_wr.write_all(&plain[..n]).await?; - } - None => break, // WS closed - } - } - } - } - Ok(()) -} - -async fn next_binary(socket: &mut WebSocket) -> anyhow::Result>> { - while let Some(msg) = socket.next().await { - match msg? { - Message::Binary(b) => return Ok(Some(b.to_vec())), - Message::Close(_) => return Ok(None), - Message::Text(_) | Message::Ping(_) | Message::Pong(_) => continue, - } - } - Ok(None) -} diff --git a/src/noise_gateway/upstream.rs b/src/noise_gateway/upstream.rs deleted file mode 100644 index 066b9b6..0000000 --- a/src/noise_gateway/upstream.rs +++ /dev/null @@ -1,227 +0,0 @@ -//! Unix-socket client for easyenclave's agent socket. -//! -//! Wire protocol: one request/response per unix stream. Each request -//! is a single line of JSON (`{"method": "...", ...}`) terminated by -//! `\n`; the response is one line of JSON. `EE_TOKEN` (if present in -//! the process env at boot) is injected into every request envelope. -//! -//! `attach_stream` is the exception — EE replies with a one-line ack -//! and then the socket carries raw PTY bytes bidirectionally until -//! either side closes. The returned tuple is `(ack_json, socket)` -//! and the caller is responsible for byte-bridging. 
- -use std::path::PathBuf; - -use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader}; -use tokio::net::UnixStream; - -pub const DEFAULT_EE_AGENT_SOCK: &str = "/var/lib/easyenclave/agent.sock"; - -pub struct EeAgent { - path: PathBuf, - token: Option, -} - -impl EeAgent { - pub fn new(path: PathBuf, token: Option) -> Self { - Self { path, token } - } - - /// Forward a request envelope to EE's agent socket, injecting the - /// `token` field if one was supplied at boot. Returns EE's raw - /// response value. - pub async fn call(&self, mut req: serde_json::Value) -> anyhow::Result { - self.inject_token(&mut req); - - let mut stream = UnixStream::connect(&self.path).await?; - let line = serde_json::to_vec(&req)?; - stream.write_all(&line).await?; - stream.write_all(b"\n").await?; - stream.flush().await?; - - let mut reader = BufReader::new(stream); - let mut resp = String::new(); - reader.read_line(&mut resp).await?; - let value: serde_json::Value = serde_json::from_str(resp.trim_end())?; - Ok(value) - } - - /// Send an `attach`-shaped request, read the one-line ack, and - /// return the socket so the caller can bridge raw PTY bytes. The - /// ack is forwarded to the caller so the Noise-side client sees - /// the same `{"ok": true, ...}` it would get from EE directly. - /// - /// Returns `Err` if EE's ack is `{"ok": false}` or malformed — in - /// that case the caller should not start bridging and should pass - /// the error back to the client as a normal one-shot response. - pub async fn attach_stream( - &self, - mut req: serde_json::Value, - ) -> anyhow::Result<(serde_json::Value, UnixStream)> { - self.inject_token(&mut req); - - let mut stream = UnixStream::connect(&self.path).await?; - let line = serde_json::to_vec(&req)?; - stream.write_all(&line).await?; - stream.write_all(b"\n").await?; - stream.flush().await?; - - // Read one line of ack byte-by-byte so the buffered reader - // doesn't swallow subsequent raw-stream bytes. 
- let mut ack = Vec::new(); - let mut byte = [0u8; 1]; - loop { - match stream.read(&mut byte).await? { - 0 => anyhow::bail!("EE attach: closed before ack"), - _ if byte[0] == b'\n' => break, - _ if ack.len() > 4096 => anyhow::bail!("EE attach: ack too long"), - _ => ack.push(byte[0]), - } - } - let ack_val: serde_json::Value = serde_json::from_slice(&ack)?; - if ack_val.get("ok").and_then(|v| v.as_bool()) != Some(true) { - anyhow::bail!("EE attach rejected: {ack_val}"); - } - Ok((ack_val, stream)) - } - - fn inject_token(&self, req: &mut serde_json::Value) { - if let Some(tok) = &self.token { - if let Some(obj) = req.as_object_mut() { - obj.insert("token".to_string(), serde_json::Value::String(tok.clone())); - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use tokio::io::{AsyncReadExt, AsyncWriteExt}; - use tokio::net::UnixListener; - - async fn spawn_echo(path: PathBuf) { - let listener = UnixListener::bind(&path).unwrap(); - tokio::spawn(async move { - while let Ok((mut stream, _)) = listener.accept().await { - tokio::spawn(async move { - let mut buf = Vec::new(); - let mut one = [0u8; 1]; - while stream.read_exact(&mut one).await.is_ok() { - if one[0] == b'\n' { - break; - } - buf.push(one[0]); - } - let req: serde_json::Value = - serde_json::from_slice(&buf).unwrap_or(serde_json::json!({})); - let resp = serde_json::json!({ "echo": req }); - let mut line = serde_json::to_vec(&resp).unwrap(); - line.push(b'\n'); - let _ = stream.write_all(&line).await; - }); - } - }); - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - } - - #[tokio::test] - async fn injects_token() { - let dir = tempfile::tempdir().unwrap(); - let sock = dir.path().join("ee.sock"); - spawn_echo(sock.clone()).await; - let agent = EeAgent::new(sock.clone(), Some("deadbeef".into())); - let resp = agent - .call(serde_json::json!({"method": "list"})) - .await - .unwrap(); - assert_eq!(resp["echo"]["token"], "deadbeef"); - assert_eq!(resp["echo"]["method"], "list"); - } 
- - #[tokio::test] - async fn omits_token_when_absent() { - let dir = tempfile::tempdir().unwrap(); - let sock = dir.path().join("ee.sock"); - spawn_echo(sock.clone()).await; - let agent = EeAgent::new(sock.clone(), None); - let resp = agent - .call(serde_json::json!({"method": "list"})) - .await - .unwrap(); - assert!(resp["echo"].get("token").is_none()); - } - - /// Fake EE that speaks the attach protocol: read one line, reply - /// `{"ok": true}\n`, then echo raw bytes until the client closes. - async fn spawn_attach_echo(path: PathBuf) { - let listener = UnixListener::bind(&path).unwrap(); - tokio::spawn(async move { - while let Ok((mut stream, _)) = listener.accept().await { - tokio::spawn(async move { - let mut req = Vec::new(); - let mut one = [0u8; 1]; - while stream.read_exact(&mut one).await.is_ok() { - if one[0] == b'\n' { - break; - } - req.push(one[0]); - } - let _ = stream.write_all(b"{\"ok\":true}\n").await; - let mut buf = [0u8; 64]; - while let Ok(n) = stream.read(&mut buf).await { - if n == 0 { - break; - } - if stream.write_all(&buf[..n]).await.is_err() { - break; - } - } - }); - } - }); - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - } - - #[tokio::test] - async fn attach_stream_reads_ack_and_echoes() { - let dir = tempfile::tempdir().unwrap(); - let sock = dir.path().join("ee.sock"); - spawn_attach_echo(sock.clone()).await; - let agent = EeAgent::new(sock.clone(), None); - let (ack, mut stream) = agent - .attach_stream(serde_json::json!({"method": "attach", "cmd": ["bash"]})) - .await - .unwrap(); - assert_eq!(ack["ok"], true); - - stream.write_all(b"hello").await.unwrap(); - let mut buf = [0u8; 5]; - stream.read_exact(&mut buf).await.unwrap(); - assert_eq!(&buf, b"hello"); - } - - #[tokio::test] - async fn attach_stream_rejects_ok_false_ack() { - // Server replies `{"ok": false, "reason": "nope"}` and closes. 
- let dir = tempfile::tempdir().unwrap(); - let sock = dir.path().join("ee.sock"); - let listener = UnixListener::bind(&sock).unwrap(); - tokio::spawn(async move { - if let Ok((mut stream, _)) = listener.accept().await { - let mut discard = [0u8; 1]; - while stream.read_exact(&mut discard).await.is_ok() && discard[0] != b'\n' {} - let _ = stream - .write_all(b"{\"ok\":false,\"reason\":\"nope\"}\n") - .await; - } - }); - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - let agent = EeAgent::new(sock, None); - let err = agent - .attach_stream(serde_json::json!({"method": "attach"})) - .await - .unwrap_err(); - assert!(err.to_string().contains("rejected"), "got {err}"); - } -} diff --git a/src/stonith.rs b/src/stonith.rs deleted file mode 100644 index 71231b4..0000000 --- a/src/stonith.rs +++ /dev/null @@ -1,124 +0,0 @@ -//! STONITH: kill old CP VMs so the new one owns `$DD_HOSTNAME`. -//! -//! 1. **Tunnel-delete STONITH at startup** — list CF tunnels, find -//! CP tunnels (name starts with `dd-{env}-cp-`) that aren't ours, -//! delete them. Their cloudflared exits; their watchdog picks up -//! tunnel-gone and `poweroff`s within ~30 s. -//! 2. **Self-watchdog** — poll CF every 8–12 s for our own tunnel. -//! Three consecutive gone readings → poweroff. Catches the case -//! where we *are* the old CP being killed. - -use std::time::Duration; - -use rand::Rng; -use reqwest::Client; - -use crate::cf; -use crate::config::CfCreds; - -const POLL_BASE_SECS: u64 = 10; -const POLL_JITTER_SECS: u64 = 4; -const INITIAL_MIN_SECS: u64 = 10; -const INITIAL_MAX_SECS: u64 = 25; -const CONSECUTIVE_GONE: u32 = 3; - -/// Poweroff via reboot(2). Bypasses busybox's PID-1-is-systemd -/// assumption. Requires CAP_SYS_BOOT (we're root). Linux-only; on -/// other targets (developer workstations running `cargo test`) we -/// just abort, since there's no enclave to tear down. -#[cfg(target_os = "linux")] -pub fn poweroff() -> ! 
{ - unsafe { - libc::sync(); - libc::reboot(libc::LINUX_REBOOT_CMD_POWER_OFF); - } - std::process::abort(); -} - -#[cfg(not(target_os = "linux"))] -pub fn poweroff() -> ! { - eprintln!("stonith: poweroff called on non-linux target — aborting process"); - std::process::abort(); -} - -/// Delete any CP tunnel (by name prefix) except our own. -pub async fn kill_old_tunnels(http: &Client, cf: &CfCreds, self_tunnel_id: &str, env: &str) { - let prefix = cf::cp_prefix(env); - let Ok(tunnels) = cf::list(http, cf).await else { - eprintln!("stonith: list failed"); - return; - }; - for t in &tunnels { - let Some(id) = t["id"].as_str() else { continue }; - if id == self_tunnel_id { - continue; - } - let Some(name) = t["name"].as_str() else { - continue; - }; - if !name.starts_with(&prefix) { - continue; - } - eprintln!("stonith: killing old CP tunnel {name} ({id})"); - cf::delete_by_name(http, cf, name).await; - } -} - -/// How long we give in-flight requests to finish after the tunnel is -/// confirmed deleted before forcing a poweroff. Buys time for streaming -/// Noise sessions + long polls on the old CP to drain cleanly. -const GRACEFUL_DRAIN: Duration = Duration::from_secs(30); - -/// Background watchdog — runs until the VM powers off. When the -/// tunnel goes away (new CP took over), fires `shutdown` to let axum -/// drain in-flight requests, waits up to `GRACEFUL_DRAIN`, then -/// powers off. -pub async fn self_watchdog( - cf: CfCreds, - self_tunnel_id: String, - shutdown: tokio::sync::broadcast::Sender<()>, -) -> ! 
{ - let http = reqwest::Client::builder() - .timeout(Duration::from_secs(10)) - .build() - .unwrap_or_else(|_| reqwest::Client::new()); - - let initial = rand::thread_rng().gen_range(INITIAL_MIN_SECS..=INITIAL_MAX_SECS); - tokio::time::sleep(Duration::from_secs(initial)).await; - - let mut gone: u32 = 0; - loop { - match cf::exists(&http, &cf, &self_tunnel_id).await { - Some(true) => { - if gone > 0 { - eprintln!("stonith: watchdog recovered after {gone} missed check(s)"); - } - gone = 0; - } - Some(false) => { - gone += 1; - eprintln!("stonith: watchdog: tunnel gone ({gone}/{CONSECUTIVE_GONE})"); - if gone >= CONSECUTIVE_GONE { - eprintln!( - "stonith: tunnel {self_tunnel_id} confirmed deleted — draining ({}s) before poweroff", - GRACEFUL_DRAIN.as_secs() - ); - // Signal axum::serve(..).with_graceful_shutdown(..) - // to stop accepting new connections and let the - // live ones finish. `send` can only fail if every - // subscriber has been dropped, which is fine - // (nothing to drain anyway). - let _ = shutdown.send(()); - tokio::time::sleep(GRACEFUL_DRAIN).await; - poweroff(); - } - } - None => { - eprintln!("stonith: watchdog: ambiguous check result"); - } - } - let jitter = rand::thread_rng().gen_range(0..=POLL_JITTER_SECS); - let secs = POLL_BASE_SECS + jitter - POLL_JITTER_SECS / 2; - tokio::time::sleep(Duration::from_secs(secs)).await; - } -} diff --git a/src/taint.rs b/src/taint.rs deleted file mode 100644 index f3e386f..0000000 --- a/src/taint.rs +++ /dev/null @@ -1,119 +0,0 @@ -//! Integrity taint-reason tracking for a dd-agent node. -//! -//! An agent is either pristine (no reasons) or tainted (one or more -//! reasons). The spec (SATS_FOR_COMPUTE_SPEC.md) defines taint as a -//! SET of reasons — not a boolean — each tied to a specific mechanism -//! that let a non-fleet party influence the node. Third-party -//! verifiers who read `/health` reconstruct the node's trust profile -//! from the presence/absence of specific reasons: -//! -//! 
- `customer_workload_deployed + customer_owner_enabled + interactive_shell_enabled` -//! → full customer-deploy mode (shared admin, shell access). -//! - `customer_workload_deployed` only -//! → confidential mode (sealed oracle; no exec channels for anyone). -//! - empty set → pristine. -//! -//! v0 scope: taint is INFORMATIONAL. DD doesn't hard-block actions -//! based on it; the reasons just mirror what the node's boot config + -//! runtime events actually produced, for honest disclosure. - -use std::collections::HashSet; -use std::sync::Arc; - -use serde::{Deserialize, Serialize}; -use tokio::sync::RwLock; - -/// One concrete mechanism through which a customer (or operator on -/// behalf of a customer) influenced the node post-boot. -#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, Serialize, Deserialize, Ord, PartialOrd)] -#[serde(rename_all = "snake_case")] -pub enum TaintReason { - /// `POST /owner` was invoked with a non-fleet tenant org; the - /// agent accepts that org's GitHub OIDC on `/deploy`/`/exec`/ - /// `/logs`. Does not include ops (`DD_OWNER`) using `/deploy`. - CustomerOwnerEnabled, - /// A workload was deployed via `POST /deploy` at runtime. True - /// whether the caller was ops or the tenant — any runtime deploy - /// moves the node away from pristine. - CustomerWorkloadDeployed, - /// The agent booted with `/deploy` + `/exec` routes enabled. - /// False in confidential mode (`DD_CONFIDENTIAL=true`), where - /// the mutation endpoints aren't registered at all. Derived at - /// boot from config; never toggled at runtime. - ArbitraryExecEnabled, - /// Interactive shell (ttyd or equivalent) is in the running - /// workload set. Reserved — not populated by v0. Left in the - /// enum so the `/health` schema is stable when it lands. - #[allow(dead_code)] - InteractiveShellEnabled, -} - -/// Thread-safe handle over a `HashSet`. Shared by the -/// axum state and the per-handler tainting code. Cheap to clone -/// (just an `Arc` bump). 
-#[derive(Clone, Default)] -pub struct TaintSet { - inner: Arc>>, -} - -impl TaintSet { - /// Seed the set with a boot-time reasons (e.g. `ArbitraryExecEnabled` - /// when the agent boots in non-confidential mode). - pub fn with_initial(reasons: impl IntoIterator) -> Self { - let set: HashSet<_> = reasons.into_iter().collect(); - Self { - inner: Arc::new(RwLock::new(set)), - } - } - - /// Insert a reason idempotently. Returns whether the reason was - /// newly added (`true` first time, `false` on subsequent calls). - pub async fn insert(&self, reason: TaintReason) -> bool { - self.inner.write().await.insert(reason) - } - - /// Snapshot the current set as a sorted `Vec` — stable ordering - /// so `/health` JSON is diff-friendly and the TDX quote's - /// embedded taint set has a canonical form. - pub async fn snapshot(&self) -> Vec { - let mut v: Vec<_> = self.inner.read().await.iter().copied().collect(); - v.sort(); - v - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn insert_is_idempotent() { - let s = TaintSet::default(); - assert!(s.insert(TaintReason::CustomerOwnerEnabled).await); - assert!(!s.insert(TaintReason::CustomerOwnerEnabled).await); - assert_eq!(s.snapshot().await, vec![TaintReason::CustomerOwnerEnabled]); - } - - #[tokio::test] - async fn snapshot_is_sorted() { - let s = TaintSet::with_initial([ - TaintReason::ArbitraryExecEnabled, - TaintReason::CustomerOwnerEnabled, - ]); - s.insert(TaintReason::CustomerWorkloadDeployed).await; - assert_eq!( - s.snapshot().await, - vec![ - TaintReason::CustomerOwnerEnabled, - TaintReason::CustomerWorkloadDeployed, - TaintReason::ArbitraryExecEnabled, - ] - ); - } - - #[test] - fn reasons_serialize_as_snake_case() { - let s = serde_json::to_string(&TaintReason::CustomerOwnerEnabled).unwrap(); - assert_eq!(s, "\"customer_owner_enabled\""); - } -} diff --git a/style.css b/style.css deleted file mode 100644 index 59bed56..0000000 --- a/style.css +++ /dev/null @@ -1,146 +0,0 @@ -:root { - 
--bg: #1e1e2e; - --bg-alt: #181825; - --surface: #313244; - --text: #cdd6f4; - --text-dim: #a6adc8; - --blue: #89b4fa; - --green: #a6e3a1; - --mauve: #cba6f7; - --red: #f38ba8; - --mono: 'JetBrains Mono', ui-monospace, 'Cascadia Code', 'Fira Code', monospace; - --sans: -apple-system, BlinkMacSystemFont, 'Segoe UI', system-ui, sans-serif; -} -* { box-sizing: border-box; margin: 0; padding: 0; } -html { scroll-behavior: smooth; } -body { background: var(--bg); color: var(--text); font-family: var(--sans); line-height: 1.6; } -a { color: var(--blue); text-decoration: none; } -a:hover { text-decoration: underline; } -.container { max-width: 960px; margin: 0 auto; padding: 0 24px; } - -/* Nav */ -nav { display: flex; align-items: center; justify-content: space-between; padding: 16px 24px; max-width: 960px; margin: 0 auto; } -nav .logo { color: var(--blue); font-weight: 700; font-size: 18px; font-family: var(--mono); } -nav .links { display: flex; gap: 24px; align-items: center; } -nav .links a { color: var(--text-dim); font-size: 14px; } -nav .links a:hover { color: var(--text); text-decoration: none; } - -/* Hero */ -.hero { padding: 120px 24px 80px; text-align: center; max-width: 720px; margin: 0 auto; } -.hero h1 { font-size: clamp(32px, 5vw, 48px); font-weight: 800; line-height: 1.1; margin-bottom: 20px; } -.hero .accent { color: var(--green); } -.hero p { color: var(--text-dim); font-size: 18px; max-width: 560px; margin: 0 auto 32px; } -.hero .buttons { display: flex; gap: 12px; justify-content: center; flex-wrap: wrap; } -.btn { display: inline-block; padding: 12px 28px; border-radius: 8px; font-weight: 600; font-size: 15px; border: none; cursor: pointer; } -.btn-primary { background: var(--blue); color: var(--bg); } -.btn-primary:hover { opacity: 0.9; text-decoration: none; } -.btn-outline { border: 1px solid var(--surface); color: var(--text); background: transparent; } -.btn-outline:hover { border-color: var(--text-dim); text-decoration: none; } - -/* Stats */ 
-.stats { display: flex; justify-content: center; gap: 48px; margin-top: 48px; padding-top: 32px; border-top: 1px solid var(--surface); flex-wrap: wrap; } -.stat { text-align: center; } -.stat .value { display: block; font-family: var(--mono); font-size: 14px; color: var(--green); font-weight: 700; } -.stat .label { font-size: 13px; color: var(--text-dim); } - -/* Section */ -section { padding: 80px 24px; } -section h2 { font-size: 28px; font-weight: 700; margin-bottom: 12px; } -section .subtitle { color: var(--text-dim); font-size: 16px; margin-bottom: 40px; max-width: 560px; } -.section-center { text-align: center; } -.section-center .subtitle { margin-left: auto; margin-right: auto; } - -/* Grid */ -.grid-3 { display: grid; grid-template-columns: repeat(auto-fit, minmax(260px, 1fr)); gap: 20px; max-width: 960px; margin: 0 auto; } -.card { background: var(--bg-alt); border: 1px solid var(--surface); border-radius: 12px; padding: 28px; } -.card .icon { font-size: 28px; margin-bottom: 12px; } -.card h3 { font-size: 17px; font-weight: 600; margin-bottom: 8px; } -.card p { color: var(--text-dim); font-size: 14px; line-height: 1.5; } - -/* Steps */ -.steps { max-width: 640px; margin: 0 auto; display: flex; flex-direction: column; gap: 24px; } -.step { display: flex; gap: 20px; align-items: flex-start; } -.step-num { flex-shrink: 0; width: 36px; height: 36px; border-radius: 50%; background: var(--surface); color: var(--blue); font-family: var(--mono); font-weight: 700; font-size: 15px; display: flex; align-items: center; justify-content: center; } -.step-content h3 { font-size: 16px; font-weight: 600; margin-bottom: 4px; } -.step-content p { color: var(--text-dim); font-size: 14px; } - -/* Architecture */ -.arch { background: var(--bg-alt); border: 1px solid var(--surface); border-radius: 12px; padding: 32px; max-width: 720px; margin: 0 auto; overflow-x: auto; } -.arch pre { font-family: var(--mono); font-size: 13px; color: var(--text-dim); line-height: 1.7; 
white-space: pre; } -.arch .hl { color: var(--blue); } -.arch .gr { color: var(--green); } - -/* Code */ -.code-block { background: var(--bg-alt); border: 1px solid var(--surface); border-radius: 12px; overflow: hidden; max-width: 640px; margin: 0 auto; } -.code-header { padding: 10px 16px; border-bottom: 1px solid var(--surface); font-family: var(--mono); font-size: 12px; color: var(--text-dim); } -.code-block pre { padding: 20px; font-family: var(--mono); font-size: 13px; line-height: 1.6; overflow-x: auto; } -.code-block .k { color: var(--mauve); } -.code-block .s { color: var(--green); } -.code-block .c { color: var(--text-dim); font-style: italic; } - -/* Callout */ -.callout { background: var(--bg-alt); border: 1px solid var(--surface); border-radius: 12px; padding: 40px; text-align: center; max-width: 640px; margin: 0 auto; } -.callout h3 { font-size: 20px; margin-bottom: 12px; } -.callout p { color: var(--text-dim); font-size: 15px; margin-bottom: 20px; } - -/* Two-mode comparison */ -.modes { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin-top: 32px; } -.mode { background: var(--bg-alt); border: 1px solid var(--surface); border-radius: 10px; padding: 24px; } -.mode h3 { margin: 12px 0 12px; font-size: 18px; } -.mode p { color: var(--text); font-size: 14px; margin-bottom: 12px; line-height: 1.6; } -.mode p.dim { color: var(--text-dim); font-style: italic; font-size: 13px; } -.mode-tag { - display: inline-block; padding: 3px 10px; border-radius: 4px; - font-size: 11px; font-weight: 600; letter-spacing: 0.05em; text-transform: uppercase; - font-family: var(--mono); -} -.mode-customer h3 { color: var(--mauve); } -.mode-customer .mode-tag { background: rgba(203, 166, 247, 0.15); color: var(--mauve); } -.mode-confidential h3 { color: #fab387; } -.mode-confidential .mode-tag { background: rgba(250, 179, 135, 0.15); color: #fab387; } - -/* Trust model table */ -.trust-table { border-collapse: collapse; width: 100%; margin: 24px 0 16px; } 
-.trust-table td { - vertical-align: top; padding: 14px 12px; - border-bottom: 1px solid var(--surface); - font-size: 14px; color: var(--text); - line-height: 1.6; -} -.trust-table td:first-child { width: 140px; white-space: nowrap; } -.state-pill { - display: inline-block; padding: 3px 10px; border-radius: 4px; - font-size: 12px; font-weight: 600; letter-spacing: 0.04em; text-transform: uppercase; - font-family: var(--mono); -} -.state-pristine { background: rgba(166, 227, 161, 0.18); color: var(--green); } -.state-tainted { background: rgba(250, 179, 135, 0.18); color: #fab387; } -.state-safe { background: rgba(137, 180, 250, 0.18); color: var(--blue); } -.reasons { list-style: none; padding-left: 0; } -.reasons li { padding: 4px 0; font-size: 13px; color: var(--text-dim); } -.reasons li code { color: var(--text); } -.trust-cta { - margin-top: 16px; padding: 14px 16px; - background: rgba(250, 179, 135, 0.08); border-left: 3px solid #fab387; border-radius: 0 6px 6px 0; - font-size: 14px; color: var(--text); line-height: 1.6; -} - -/* CTA */ -.cta { text-align: center; padding: 80px 24px; } -.cta h2 { font-size: 28px; margin-bottom: 12px; } -.cta p { color: var(--text-dim); margin-bottom: 24px; } - -/* Footer */ -footer { border-top: 1px solid var(--surface); padding: 32px 24px; text-align: center; } -footer p { color: var(--text-dim); font-size: 13px; } -footer a { color: var(--text-dim); margin: 0 12px; } -footer a:hover { color: var(--text); } - -@media (max-width: 600px) { - nav .links { display: none; } - .hero { padding: 80px 16px 48px; } - .stats { gap: 24px; } - .grid-3 { grid-template-columns: 1fr; } - .modes { grid-template-columns: 1fr; } - .trust-table td:first-child { width: auto; } -}