From b749f9544091bcfac5bd656114f27123c33dc628 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Thu, 14 May 2026 12:37:29 +0800 Subject: [PATCH 1/5] feat(buy-external): add KEEP_CLUSTER_ON_FAIL knob + diagnostic snapshot on FAIL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `flows/buy-external.sh` fails (typically at step 14, the `buy.py buy` invocation), the existing `external_cleanup` immediately tears the cluster down — destroying the only places that record why the PurchaseRequest never advanced (controller logs, PR `status.conditions[]`, sidecar `/status`). This commit: - Adds `external_snapshot_on_fail()` — best-effort capture of controller logs (current + `--previous`), PurchaseRequest YAML across all namespaces, buyer sidecar `/status` (via `kubectl exec ... python3` against the litellm container — buyer container is distroless), `cluster-pods.txt`, and recent `cluster-events.txt`. All commands wrapped in `|| true` so a single failure doesn't abort the bundle. Empty/failed files are removed. - Calls the snapshot from `external_cleanup` BEFORE any teardown, on the failure path only — clean exits keep the existing fast-cleanup behavior. - Honors `KEEP_CLUSTER_ON_FAIL=1` (default unset) — when set, skips `bob stack down` after the snapshot bundle is written and prints the preserved stack id + artifact dir + manual cleanup hint. Unblocks investigation of v1337-style external-seller failures documented in plans/inference-v1337-buy-report-20260514.md. --- flows/buy-external.sh | 94 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 1 deletion(-) diff --git a/flows/buy-external.sh b/flows/buy-external.sh index 7b085a88..c69cf49e 100755 --- a/flows/buy-external.sh +++ b/flows/buy-external.sh @@ -180,6 +180,84 @@ fi PF_LITELLM="" PF_LITELLM_LOG="" +# Best-effort diagnostic snapshot, taken on the failure path BEFORE the cluster +# is torn down. 
Each capture is guarded by an `if`/`else` so a single failure does +# not abort the rest of the bundle. The sidecar status snapshot uses the same +# `kubectl exec ... python3 -c` shape as the in-flow before/after captures +# (the buyer container is distroless — no curl/wget). +external_snapshot_on_fail() { + type bob >/dev/null 2>&1 || return 0 + [ -d "$EXTERNAL_BUY_ARTIFACT_DIR" ] || return 0 + + local f + + f="$EXTERNAL_BUY_ARTIFACT_DIR/controller.log" + if bob kubectl logs -n x402 deploy/serviceoffer-controller --tail=2000 --previous \ + > "$f" 2>/dev/null; then + echo " snapshot: $f" + else + if bob kubectl logs -n x402 deploy/serviceoffer-controller --tail=2000 \ + > "$f" 2>/dev/null; then + echo " snapshot: $f (no --previous available)" + else + rm -f "$f" 2>/dev/null || true + fi + fi + + f="$EXTERNAL_BUY_ARTIFACT_DIR/controller-current.log" + if bob kubectl logs -n x402 deploy/serviceoffer-controller --tail=2000 \ + > "$f" 2>/dev/null; then + echo " snapshot: $f" + else + rm -f "$f" 2>/dev/null || true + fi + + f="$EXTERNAL_BUY_ARTIFACT_DIR/purchaserequest.yaml" + if bob kubectl get purchaserequest -A -o yaml > "$f" 2>/dev/null; then + echo " snapshot: $f" + else + rm -f "$f" 2>/dev/null || true + fi + + f="$EXTERNAL_BUY_ARTIFACT_DIR/buyer-status-after.json" + if bob kubectl exec -n llm deployment/litellm -c litellm -- \ + python3 -c " +import urllib.request, json +try: + resp = urllib.request.urlopen('http://localhost:8402/status', timeout=5) + print(json.dumps(json.loads(resp.read()), indent=2)) +except Exception as e: + print(json.dumps({'error': repr(e)})) +" > "$f" 2>/dev/null; then + echo " snapshot: $f" + else + rm -f "$f" 2>/dev/null || true + fi + + # Re-use the harness-captured buy.py log if it was written; do not re-fetch. 
+ if [ -f "$EXTERNAL_BUY_ARTIFACT_DIR/buy-py.log" ]; then + f="$EXTERNAL_BUY_ARTIFACT_DIR/agent-pod-buypy.log" + if cp "$EXTERNAL_BUY_ARTIFACT_DIR/buy-py.log" "$f" 2>/dev/null; then + echo " snapshot: $f" + fi + fi + + f="$EXTERNAL_BUY_ARTIFACT_DIR/cluster-pods.txt" + if bob kubectl get pods -A -o wide > "$f" 2>/dev/null; then + echo " snapshot: $f" + else + rm -f "$f" 2>/dev/null || true + fi + + f="$EXTERNAL_BUY_ARTIFACT_DIR/cluster-events.txt" + if bob kubectl get events -A --sort-by='.lastTimestamp' 2>/dev/null \ + | tail -100 > "$f" 2>/dev/null && [ -s "$f" ]; then + echo " snapshot: $f" + else + rm -f "$f" 2>/dev/null || true + fi +} + external_cleanup() { local ec=$? set +e @@ -190,7 +268,21 @@ external_cleanup() { # tear it down if the flow already failed — a leaked k3d cluster between # runs eats Docker network space (cleanup_k3d_obol_networks reclaims). if [ "$ec" -ne 0 ] && type bob >/dev/null 2>&1; then - bob stack down >/dev/null 2>&1 || true + # Snapshot diagnostics BEFORE the cluster goes away — these are the + # only places that record why the PurchaseRequest never advanced. + echo "Capturing failure snapshot to $EXTERNAL_BUY_ARTIFACT_DIR" + external_snapshot_on_fail + + if [ "${KEEP_CLUSTER_ON_FAIL:-0}" = "1" ]; then + echo "" + echo "KEEP_CLUSTER_ON_FAIL=1 → cluster preserved." + echo " Stack id: $PINNED_STACK_ID" + echo " Artifacts: $EXTERNAL_BUY_ARTIFACT_DIR" + echo " Manual cleanup when done:" + echo " bob stack down" + else + bob stack down >/dev/null 2>&1 || true + fi fi cleanup_k3d_obol_networks set -e From eb130558374294ed37830621e6754dc7a5622ef0 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Thu, 14 May 2026 12:37:43 +0800 Subject: [PATCH 2/5] fix(flows): pick freshest of .build/obol vs .workspace/bin/obol in bootstrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `bootstrap_flow_workspace` previously copied unconditionally from the caller-supplied path (always `$OBOL_ROOT/.build/obol`). 
When iterating on embedded skill content (e.g. `internal/embed/skills/buy-x402/scripts/buy.py`) it's easy to rebuild one of the two binaries and forget the other, silently baking pre-fix files into the cluster PVC via `syncObolSkills`. Burned six hours during the v1337 live-buy investigation (attempt 5 in plans/inference-v1337-buy-report-20260514.md). Now: stat both paths, pick the one with the larger mtime, and emit a 5-line WARN to stderr when the two differ by more than 5 minutes — header + both paths-with-mtimes + which one was picked + a one-line rebuild nudge. Cross-OS stat handled via `stat -c %Y` with `stat -f %m` fallback. Date formatted with `date -r <file>` (BSD/macOS friendly), GNU `date -u -d "@<mtime>"` fallback. Contract preserved (no return value, copies into `$dir/bin/obol`). --- flows/lib.sh | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/flows/lib.sh b/flows/lib.sh index d67fe512..036ed940 100755 --- a/flows/lib.sh +++ b/flows/lib.sh @@ -517,9 +517,46 @@ bootstrap_flow_workspace() { local dir="$1" local obol_bin="$2" local tool src + local workspace_bin="$OBOL_ROOT/.workspace/bin/obol" + local picked="$obol_bin" + local picked_mtime other_mtime delta abs_delta + + # Pick the freshest of the caller-supplied binary and the workspace binary. + # During iteration on embedded skill content (e.g. buy-x402/scripts/buy.py) + # it is easy to rebuild one and forget the other; copying the stale one + # silently bakes pre-fix files into the cluster PVC. See pitfall in + # plans/inference-v1337-buy-report-20260514.md (v1337 attempt 5). 
+ if [ -f "$obol_bin" ] && [ -f "$workspace_bin" ] && [ "$obol_bin" != "$workspace_bin" ]; then + picked_mtime=$(stat -c %Y "$obol_bin" 2>/dev/null || stat -f %m "$obol_bin" 2>/dev/null || echo 0) + other_mtime=$(stat -c %Y "$workspace_bin" 2>/dev/null || stat -f %m "$workspace_bin" 2>/dev/null || echo 0) + if [ "$other_mtime" -gt "$picked_mtime" ]; then + picked="$workspace_bin" + delta=$((picked_mtime - other_mtime)) + else + delta=$((other_mtime - picked_mtime)) + fi + abs_delta=${delta#-} + if [ "$abs_delta" -gt 300 ]; then + local fmt_a fmt_b picked_fmt + fmt_a=$(date -r "$obol_bin" -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -d "@$picked_mtime" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "?") + fmt_b=$(date -r "$workspace_bin" -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -d "@$other_mtime" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "?") + if [ "$picked" = "$workspace_bin" ]; then + picked_fmt="$workspace_bin (mtime $fmt_b)" + else + picked_fmt="$obol_bin (mtime $fmt_a)" + fi + echo " WARN: obol binary mtimes differ by ${abs_delta}s — one of these was likely forgotten in a rebuild" >&2 + echo " $obol_bin mtime $fmt_a" >&2 + echo " $workspace_bin mtime $fmt_b" >&2 + echo " picked: $picked_fmt" >&2 + echo " Rebuild both with \`go build -o .build/obol ./cmd/obol && go build -o .workspace/bin/obol ./cmd/obol\` if you've been iterating on embedded skill content." >&2 + fi + elif [ ! 
-f "$obol_bin" ] && [ -f "$workspace_bin" ]; then + picked="$workspace_bin" + fi reset_flow_workspace "$dir" - cp "$obol_bin" "$dir/bin/obol" + cp "$picked" "$dir/bin/obol" chmod +x "$dir/bin/obol" for tool in kubectl helm helmfile k3d k9s openclaw; do src=$(command -v "$tool" 2>/dev/null || printf '%s\n' "$OBOL_ROOT/.workspace/bin/$tool") From 849cd93d751e0d327f56f75d4ea38fa506aab23e Mon Sep 17 00:00:00 2001 From: bussyjd Date: Thu, 14 May 2026 12:37:54 +0800 Subject: [PATCH 3/5] docs(skill): document Cloudflare-WAF UA pitfall + Go-side follow-up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds entry #10 to the release-smoke debugging reference covering the HTTP 403 + Cloudflare error 1010 we hit on v1337 attempts 3–4: managed WAF rules block the default `Python-urllib/X.Y` UA. Documents the buy.py fix (commit c2dddc1) plus the unconfirmed-but-likely Go-side follow-up at internal/serviceoffercontroller/purchase.go:183, where Go's `http.Client` defaults to `User-Agent: Go-http-client/1.1` and may hit the same WAF block on the controller probe. --- .../obol-stack-dev/references/release-smoke-debugging.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.agents/skills/obol-stack-dev/references/release-smoke-debugging.md b/.agents/skills/obol-stack-dev/references/release-smoke-debugging.md index c27bd913..f08209d9 100644 --- a/.agents/skills/obol-stack-dev/references/release-smoke-debugging.md +++ b/.agents/skills/obol-stack-dev/references/release-smoke-debugging.md @@ -99,6 +99,14 @@ Earlier `1.4.9` of `ghcr.io/obolnetwork/x402-facilitator-prometheus-overlay` shi - **Fixed upstream**: `ObolNetwork/x402-rs#3` (merged 2026-05-13, `668b7bb`) dropped the platform pin. The publish workflow republished `1.4.9` on push to `main`; arm64 digest is now `sha256:b209345c5e05415df36444b307213c61f9ca08db9f8131d0ebfebefc244ba4ec`. 
- **`X402_FACILITATOR_SKIP_PULL` knob removed** from `flows/lib.sh` once the republished image was validated against the release-smoke. If you encounter `exec format error` on an arm64 host now, the registry image is wrong, not the host — pull-fresh (`docker pull ghcr.io/obolnetwork/x402-facilitator-prometheus-overlay:1.4.9`) and check the manifest with `docker buildx imagetools inspect`. +### 10. Cloudflare WAF blocks default `Python-urllib` User-Agent on external sellers + +When buying from external x402 sellers (sellers running outside our k3d cluster — e.g. `https://inference.v1337.org/...`), some sit behind Cloudflare's managed WAF, which **blocks the default `Python-urllib/X.Y` UA with HTTP 403 + Cloudflare error 1010** ("the owner of this website has banned your access based on your browser's signature"). Both the unpaid 402 probe and the paid `X-PAYMENT` request fail; buyers see misleading auth/signing errors instead of the real cause. + +- **Symptom**: `buy.py probe` against an external seller fails with 403 (often surfaced as a JSON-decode error or "no accepts" downstream); `buy.py buy` against the same endpoint also fails before signature verification. curl with its default UA (or with a browser UA) against the same URL returns 402 cleanly. +- **Fix in repo**: `c2dddc1` — added module-level `USER_AGENT = os.environ.get("OBOL_BUYER_USER_AGENT", "obol-buy-x402/1.0 (+https://github.com/ObolNetwork/obol-stack)")` to `internal/embed/skills/buy-x402/scripts/buy.py`, applied in `_probe_endpoint` (kind=http), `_probe_endpoint` (kind=inference), and the paid `X-PAYMENT` request in `buy_paid_oneshot`. Tested four UAs against v1337 (`curl/*`, generic `Mozilla/*`, `Chrome/*`, custom `obol-buy-x402/*`) — all four returned 402 cleanly. The fix is "send anything that isn't `Python-urllib`", not "send a specific browser UA". Operator override: `OBOL_BUYER_USER_AGENT`. 
+- **Follow-up (not yet confirmed)**: the same WAF block likely affects the Go-side controller probe at `internal/serviceoffercontroller/purchase.go:183`, since Go's `http.Client` defaults to `User-Agent: Go-http-client/1.1`. Verify against v1337 and apply the same UA override on the Go side if reproduced. + ## Diagnostic Patterns - **Don't confuse 503 with "verifier broken"** — almost always one of #1, #2, #5, #6, or a missing CA bundle (`paid-flows.md`). From 82108c3af1faf0d97f8e061aa24265c3ff8a2dfb Mon Sep 17 00:00:00 2001 From: bussyjd Date: Thu, 14 May 2026 15:05:41 +0800 Subject: [PATCH 4/5] =?UTF-8?q?docs(plans):=20retract=20v1337=20controller?= =?UTF-8?q?-gap=20hypothesis=20=E2=80=94=20controller=20works?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-ran the v1337 buy with the new KEEP_CLUSTER_ON_FAIL=1 knob (commit b749f95). The controller reconciled the PurchaseRequest in 55 seconds through Probed → AuthsLoaded → Configured → Ready, against the same external endpoint the original report failed on. The original report's central technical claim — "serviceoffer-controller does not reconcile PurchaseRequests for external sellers" — is false. The controller is endpoint-agnostic by design (verified by code review of internal/serviceoffercontroller/purchase.go). Attempt 5's reconcile-hang was almost certainly a kubectl-exec session SIGKILL (exit 137), not a controller bug — likely harness-side run_with_timeout firing while buy.py was still polling normally. Today's run did surface a real but unrelated quirk: LiteLLM's POST /model/new fails with EROFS because /etc/litellm/config.yaml is mounted read-only as a Kubernetes ConfigMap volume; the controller catches this and falls back to ConfigMap reload, which works fine. Pre-existing, worth one line in paid-flows.md so the next debugger isn't startled. 
Step 18 (paid request) failed for an operator-error reason: I picked qwen3.6-27b as the upstream model id, but v1337's vLLM serves under a different name. Bob's 0.023 OBOL was NOT consumed (LiteLLM 404'd before the buyer sidecar could settle). Companion to plans/inference-v1337-buy-report-20260514.md. Retracts follow-up #1 of that report. --- plans/inference-v1337-followup-20260514.md | 102 +++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 plans/inference-v1337-followup-20260514.md diff --git a/plans/inference-v1337-followup-20260514.md b/plans/inference-v1337-followup-20260514.md new file mode 100644 index 00000000..9c95425b --- /dev/null +++ b/plans/inference-v1337-followup-20260514.md @@ -0,0 +1,102 @@ +# Live buy from `inference.v1337.org` — follow-up findings + +Date: 2026-05-14 +Test bed: spark1 (Linux aarch64) +Worktree: `/home/claude/obol-stack-qa-20260513-135712-post490` +Branch HEAD during the test: `eb13055` (`chore/buy-external-followups`) +Companion to: `plans/inference-v1337-buy-report-20260514.md` + +## TL;DR + +Re-ran the v1337 buy with `KEEP_CLUSTER_ON_FAIL=1` (new knob in `flows/buy-external.sh`, commit `b749f95`). Steps 1–17 PASS. The controller reconciled the PurchaseRequest in 55 seconds (`Probed` → `AuthsLoaded` → `Configured` → `Ready`), publishing `paid/qwen3.6-27b` via the buyer sidecar. **The "controller external-seller mode gap" hypothesis from the original report is false** — the controller is endpoint-agnostic by design (confirmed by code review of `internal/serviceoffercontroller/purchase.go`) and works against arbitrary external x402 sellers without modification. + +The original report's attempt 5 failure (`command terminated with exit code 137` at the controller-reconcile wait) was almost certainly a kubectl-exec session SIGKILL — not a controller hang. Today's identical code path completed cleanly in 0s at the same step. 
+ +## What today's run actually showed + +PR `status.conditions[]` from the captured `purchaserequest.yaml`: + +```yaml +conditions: +- type: Probed status: "True" reason: Validated message: "402: 23000000000000000 on eip155:84532" 06:37:06Z +- type: AuthsLoaded status: "True" reason: Loaded message: "Loaded 1 pre-signed auths from spec" 06:37:06Z +- type: Configured status: "True" reason: Written message: "Wrote 1 auths to llm/x402-buyer-auths" 06:37:06Z +- type: Ready status: "True" reason: Reconciled message: "Sidecar: 1 remaining, 0 spent" 06:38:01Z +observedGeneration: 1 +publicModel: paid/qwen3.6-27b +remaining: 1 +totalSigned: 1 +signerAddress: 0x57b0eF875DeB5A37301F1640E469a2129Da9490E +``` + +`buyer-status-after.json`: + +```json +{"v1337-aeon": {"url": "https://inference.v1337.org/services/aeon", "remote_model": "qwen3.6-27b", "public_model": "paid/qwen3.6-27b", "remaining": 1, "spent": 0, "network": "base-sepolia"}} +``` + +The Go-side probe at `purchase.go:183` was NOT WAF-blocked. The follow-up worry (entry #10 in `release-smoke-debugging.md`) about Cloudflare's WAF blocking `Go-http-client/1.1` UA does not reproduce against v1337. Worth keeping the doc note for the general class of WAF UA filters, but not a load-bearing concern for the controller code path. + +## Why attempt 5 looked like a controller hang + +Today the harness completed step 14 (`buy.py completed`) with the same code that produced attempt 5's exit-code-137. Two structural differences plausibly explain why attempt 5 hung where today's run sailed: + +1. **`bootstrap_flow_workspace` now picks the freshest binary** (commit `eb13055`). Attempt 5 silently used a stale `.build/obol` whose embedded buy.py lacked the USER_AGENT fix. Even after the operator rebuilt `.build/obol` mid-attempt, the Bob workspace had already been bootstrapped from the older copy. 
The PVC's buy.py wrote a probe with `Python-urllib` UA → 403 from CF → the controller's view of the world differed from the buyer's view in subtle ways. The `eb13055` fix removes that footgun for future runs. + +2. **The kubectl-exec SIGKILL was an environmental artifact.** `command terminated with exit code 137` is what kubectl prints when its remote process dies from SIGKILL — could be harness `run_with_timeout`, OOM, or control-plane jitter. None of those would be visible in the controller logs (which today's `KEEP_CLUSTER_ON_FAIL=1` snapshot showed going quiet during the wait). Today's harness completed the same `obol kubectl exec` to buy.py without issue, so the SIGKILL was not deterministic. + +## The actual blocker today + +Step 18 (paid request through LiteLLM) failed: + +``` +FAIL: [18] Paid request returned HTTP 404 +{"error":{"message":"litellm.NotFoundError: NotFoundError: OpenAIException - The model `qwen3.6-27b` does not exist.. Received Model Group=paid/qwen3.6-27b\nAvailable Model Group Fallbacks=None","type":null,"param":null,"code":"404"}} +``` + +This is an operator-error model-name mismatch: + +- LiteLLM correctly routed `paid/qwen3.6-27b` → buyer sidecar → `https://inference.v1337.org/services/aeon`. +- v1337's upstream vLLM does not serve a model named `qwen3.6-27b`. +- The actual model name is unknown from `/.well-known/agent-registration.json` (which advertises display name "Qwen3.6-27B AEON Ultimate" and skills `llm/inference, llm/uncensored`, but no model id). + +Bob's 0.023 OBOL pre-signed auth was **NOT consumed** — LiteLLM 404'd before reaching the buyer sidecar's `/settle` path. Wallet balance unchanged. + +To finish the live buy proof, the harness needs the right `--model` value. Options: (a) ask the seller, (b) probe `/v1/models` if v1337 makes it free, (c) brute-force common variants (`aeon`, `qwen-3.6-27b`, `qwen3.6`, `qwen3.6-27b-aeon`). All low-priority — the controller-side answer is already in. 
+ +## Side finding: LiteLLM hot-add quirk + +The controller logs surfaced: + +``` +purchase: hot-add paid/qwen3.6-27b failed: POST /model/new: 400 Bad Request: +{"error":{"message":"Authentication Error, [Errno 30] Read-only file system: '/etc/litellm/config.yaml'", ...}}; relying on ConfigMap reload +``` + +LiteLLM's `/model/new` API tries to write back to `/etc/litellm/config.yaml`. In our deployment that path is a Kubernetes ConfigMap volume — read-only by default. The controller catches the 400 and falls back to the ConfigMap-reload path, which works (the alias DID become available, otherwise step 17 wouldn't have passed). Pre-existing behavior, not external-seller specific. Worth a one-line note in `paid-flows.md` so the next debugger isn't startled by the WARN in controller logs. + +## Updates to original report + +Replace follow-up #1 ("serviceoffer-controller external-seller mode") with: "RESOLVED — controller is endpoint-agnostic by design. Attempt 5's reconcile-hang was a kubectl-exec SIGKILL artifact, not a controller bug. Verified 2026-05-14 with `KEEP_CLUSTER_ON_FAIL=1` re-run." + +Follow-up #2 (harness binary path) — DONE in commit `eb13055`. +Follow-up #3 (CF-WAF UA documentation) — DONE in commit `849cd93`. +Follow-up #4 (`KEEP_CLUSTER_ON_FAIL` knob) — DONE in commit `b749f95`. + +The original report still has narrative value for the four bug fixes it surfaced (k3d cluster-name cap, CAIP-2 chain id mismatch, CF-WAF Python-urllib UA, stale `.build/obol`). Only the controller hypothesis was wrong. 
+ +## Artifacts + +Under `/home/claude/obol-stack-qa-20260513-135712-post490/.tmp/v1337-rerun-20260514-063232-artifacts/` on spark1, captured by the new `external_snapshot_on_fail()`: + +- `controller.log`, `controller-current.log` — full reconcile trace +- `purchaserequest.yaml` — the conclusive `Ready=True` proof +- `buyer-status-after.json` — sidecar saw 1 remaining, 0 spent +- `agent-pod-buypy.log` — clean `buy.py` run through PR creation +- `cluster-pods.txt`, `cluster-events.txt` — full cluster state at FAIL + +The Bob k3d cluster (`obol-stack-buy-ext-bob`) is preserved on spark1 pending teardown. + +## Closing note + +The phase-1 polish items in `chore/buy-external-followups` more than paid for themselves on the first re-run: `KEEP_CLUSTER_ON_FAIL=1` made the diagnosis trivial, the binary normalization removed one of the candidate causes for attempt 5's hang, and the diagnostic snapshot bundle gave us seven artifacts that took a single bash command to inspect. The original v1337 report would have been wrong on its central technical claim if we hadn't re-run with these in place — a useful argument for keeping operator-level diagnostic ergonomics ahead of feature work. From df5fcffc31aeabe3e7505accd28413aede2b62f7 Mon Sep 17 00:00:00 2001 From: bussyjd Date: Fri, 15 May 2026 09:41:30 +0800 Subject: [PATCH 5/5] refactor(buy-external): green-only cleanup gate, drop KEEP_CLUSTER_ON_FAIL knob MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the opt-in KEEP_CLUSTER_ON_FAIL=1 env knob (added in b749f95) with an unconditional rule: cleanup happens iff every step passes. On FAIL, snapshot the diagnostic bundle and preserve the cluster — every time, no env override needed. Also inverts the prior success-side default. 
The previous design left the cluster up on success "so the operator can poke around"; in practice operators re-ran the harness from scratch when they wanted fresh state, and the leftover cluster mostly leaked across runs. With the new gate, a green run leaves a clean machine. Net behavior: - success → bob stack down (clean state for next run) - failure → snapshot + preserve (operator pays one manual teardown when done diagnosing) The diagnostic snapshot helper from b749f95 is unchanged; only the preservation gate moved from an env knob to the implicit pass/fail state. --- flows/buy-external.sh | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/flows/buy-external.sh b/flows/buy-external.sh index c69cf49e..002d0d2b 100755 --- a/flows/buy-external.sh +++ b/flows/buy-external.sh @@ -264,24 +264,23 @@ external_cleanup() { [ -n "$PF_LITELLM" ] && cleanup_pid "$PF_LITELLM" 2>/dev/null [ -n "$PF_LITELLM_LOG" ] && rm -f "$PF_LITELLM_LOG" 2>/dev/null - # Leave the cluster up on success so the operator can poke around. Only - # tear it down if the flow already failed — a leaked k3d cluster between - # runs eats Docker network space (cleanup_k3d_obol_networks reclaims). - if [ "$ec" -ne 0 ] && type bob >/dev/null 2>&1; then - # Snapshot diagnostics BEFORE the cluster goes away — these are the - # only places that record why the PurchaseRequest never advanced. - echo "Capturing failure snapshot to $EXTERNAL_BUY_ARTIFACT_DIR" - external_snapshot_on_fail - - if [ "${KEEP_CLUSTER_ON_FAIL:-0}" = "1" ]; then + # Cleanup gate: tear down only when every step passed. On FAIL, snapshot + # diagnostics and preserve the cluster — the only places that record why + # a PurchaseRequest never advanced are the controller logs, PR + # status.conditions[], and sidecar /status, all of which die with the + # cluster. Operator pays one manual `bob stack down` when done diagnosing. 
+ if type bob >/dev/null 2>&1; then + if [ "$ec" -eq 0 ]; then + bob stack down >/dev/null 2>&1 || true + else + echo "Capturing failure snapshot to $EXTERNAL_BUY_ARTIFACT_DIR" + external_snapshot_on_fail echo "" - echo "KEEP_CLUSTER_ON_FAIL=1 → cluster preserved." + echo "FAIL → cluster preserved for diagnosis." echo " Stack id: $PINNED_STACK_ID" echo " Artifacts: $EXTERNAL_BUY_ARTIFACT_DIR" echo " Manual cleanup when done:" echo " bob stack down" - else - bob stack down >/dev/null 2>&1 || true fi fi cleanup_k3d_obol_networks