From b749f9544091bcfac5bd656114f27123c33dc628 Mon Sep 17 00:00:00 2001
From: bussyjd <bussyjd@users.noreply.github.com>
Date: Thu, 14 May 2026 12:37:29 +0800
Subject: [PATCH 1/5] feat(buy-external): add KEEP_CLUSTER_ON_FAIL knob +
 diagnostic snapshot on FAIL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When `flows/buy-external.sh` fails (typically at step 14, the `buy.py buy`
invocation), the existing `external_cleanup` immediately tears the cluster
down — destroying the only places that record why the PurchaseRequest never
advanced (controller logs, PR `status.conditions[]`, sidecar `/status`).

This commit:

- Adds `external_snapshot_on_fail()` — best-effort capture of controller
  logs (current + `--previous`), PurchaseRequest YAML across all namespaces,
  buyer sidecar `/status` (via `kubectl exec ... python3` against the
  litellm container — buyer container is distroless), `cluster-pods.txt`,
  and recent `cluster-events.txt`. Each capture is guarded by its own
  `if`/`else` so a single failure doesn't abort the bundle. Files left by
  failed captures are removed.

- Calls the snapshot from `external_cleanup` BEFORE any teardown, on the
  failure path only — clean exits keep the existing fast-cleanup behavior.

- Honors `KEEP_CLUSTER_ON_FAIL=1` (default unset) — when set, skips
  `bob stack down` after the snapshot bundle is written and prints the
  preserved stack id + artifact dir + manual cleanup hint.

Unblocks investigation of v1337-style external-seller failures documented
in plans/inference-v1337-buy-report-20260514.md.
---
 flows/buy-external.sh | 94 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

diff --git a/flows/buy-external.sh b/flows/buy-external.sh
index 7b085a88..c69cf49e 100755
--- a/flows/buy-external.sh
+++ b/flows/buy-external.sh
@@ -180,6 +180,84 @@ fi
 PF_LITELLM=""
 PF_LITELLM_LOG=""
 
+# Best-effort diagnostic snapshot, taken on the failure path BEFORE the cluster
+# is torn down. Each capture is its own `if`, so one failing command does not
+# abort the rest of the bundle; failed or empty output files are removed. The
+# sidecar /status capture uses the same `kubectl exec ... python3 -c` shape as
+# the in-flow before/after captures (buyer container is distroless — no curl).
+external_snapshot_on_fail() {
+    type bob >/dev/null 2>&1 || return 0  # no `bob` command: nothing to query
+    [ -d "$EXTERNAL_BUY_ARTIFACT_DIR" ] || return 0  # no artifact dir: nowhere to write
+
+    local f
+    # Prefer the previous container instance's logs; fall back to the current one.
+    f="$EXTERNAL_BUY_ARTIFACT_DIR/controller.log"
+    if bob kubectl logs -n x402 deploy/serviceoffer-controller --tail=2000 --previous \
+        > "$f" 2>/dev/null && [ -s "$f" ]; then
+        echo "  snapshot: $f"
+    else
+        if bob kubectl logs -n x402 deploy/serviceoffer-controller --tail=2000 \
+            > "$f" 2>/dev/null && [ -s "$f" ]; then
+            echo "  snapshot: $f (no --previous available)"
+        else
+            rm -f "$f" 2>/dev/null || true
+        fi
+    fi
+    # Always capture the current container's logs as a separate artifact.
+    f="$EXTERNAL_BUY_ARTIFACT_DIR/controller-current.log"
+    if bob kubectl logs -n x402 deploy/serviceoffer-controller --tail=2000 \
+        > "$f" 2>/dev/null && [ -s "$f" ]; then
+        echo "  snapshot: $f"
+    else
+        rm -f "$f" 2>/dev/null || true
+    fi
+    # PurchaseRequest objects (including status.conditions[]) in all namespaces.
+    f="$EXTERNAL_BUY_ARTIFACT_DIR/purchaserequest.yaml"
+    if bob kubectl get purchaserequest -A -o yaml > "$f" 2>/dev/null && [ -s "$f" ]; then
+        echo "  snapshot: $f"
+    else
+        rm -f "$f" 2>/dev/null || true
+    fi
+    # Buyer sidecar /status, fetched from inside the litellm container.
+    f="$EXTERNAL_BUY_ARTIFACT_DIR/buyer-status-after.json"
+    if bob kubectl exec -n llm deployment/litellm -c litellm -- \
+        python3 -c "
+import urllib.request, json
+try:
+    resp = urllib.request.urlopen('http://localhost:8402/status', timeout=5)
+    print(json.dumps(json.loads(resp.read()), indent=2))
+except Exception as e:
+    print(json.dumps({'error': repr(e)}))
+" > "$f" 2>/dev/null && [ -s "$f" ]; then
+        echo "  snapshot: $f"
+    else
+        rm -f "$f" 2>/dev/null || true
+    fi
+
+    # Re-use the harness-captured buy.py log if it was written; do not re-fetch.
+    if [ -f "$EXTERNAL_BUY_ARTIFACT_DIR/buy-py.log" ]; then
+        f="$EXTERNAL_BUY_ARTIFACT_DIR/agent-pod-buypy.log"
+        if cp "$EXTERNAL_BUY_ARTIFACT_DIR/buy-py.log" "$f" 2>/dev/null; then
+            echo "  snapshot: $f"
+        fi
+    fi
+    # Pod inventory across all namespaces.
+    f="$EXTERNAL_BUY_ARTIFACT_DIR/cluster-pods.txt"
+    if bob kubectl get pods -A -o wide > "$f" 2>/dev/null && [ -s "$f" ]; then
+        echo "  snapshot: $f"
+    else
+        rm -f "$f" 2>/dev/null || true
+    fi
+    # Last 100 events; the pipeline's status is tail's, so -s is the real check.
+    f="$EXTERNAL_BUY_ARTIFACT_DIR/cluster-events.txt"
+    if bob kubectl get events -A --sort-by='.lastTimestamp' 2>/dev/null \
+        | tail -100 > "$f" 2>/dev/null && [ -s "$f" ]; then
+        echo "  snapshot: $f"
+    else
+        rm -f "$f" 2>/dev/null || true
+    fi
+}
+
 external_cleanup() {
     local ec=$?
     set +e
@@ -190,7 +268,21 @@ external_cleanup() {
     # tear it down if the flow already failed — a leaked k3d cluster between
     # runs eats Docker network space (cleanup_k3d_obol_networks reclaims).
     if [ "$ec" -ne 0 ] && type bob >/dev/null 2>&1; then
-        bob stack down >/dev/null 2>&1 || true
+        # Snapshot diagnostics BEFORE the cluster goes away — these are the
+        # only places that record why the PurchaseRequest never advanced.
+        echo "Capturing failure snapshot to $EXTERNAL_BUY_ARTIFACT_DIR"
+        external_snapshot_on_fail
+
+        if [ "${KEEP_CLUSTER_ON_FAIL:-0}" = "1" ]; then
+            echo ""
+            echo "KEEP_CLUSTER_ON_FAIL=1 → cluster preserved."
+            echo "  Stack id:  $PINNED_STACK_ID"
+            echo "  Artifacts: $EXTERNAL_BUY_ARTIFACT_DIR"
+            echo "  Manual cleanup when done:"
+            echo "    bob stack down"
+        else
+            bob stack down >/dev/null 2>&1 || true
+        fi
     fi
     cleanup_k3d_obol_networks
     set -e

From eb130558374294ed37830621e6754dc7a5622ef0 Mon Sep 17 00:00:00 2001
From: bussyjd <bussyjd@users.noreply.github.com>
Date: Thu, 14 May 2026 12:37:43 +0800
Subject: [PATCH 2/5] fix(flows): pick freshest of .build/obol vs
 .workspace/bin/obol in bootstrap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`bootstrap_flow_workspace` previously copied unconditionally from the
caller-supplied path (always `$OBOL_ROOT/.build/obol`). When iterating on
embedded skill content (e.g. `internal/embed/skills/buy-x402/scripts/buy.py`)
it's easy to rebuild one of the two binaries and forget the other, silently
baking pre-fix files into the cluster PVC via `syncObolSkills`. Burned six
hours during the v1337 live-buy investigation (attempt 5 in
plans/inference-v1337-buy-report-20260514.md).

Now: stat both paths, pick the one with the larger mtime, and emit a 5-line
WARN to stderr when the two differ by more than 5 minutes — header + both
paths-with-mtimes + which one was picked + a one-line rebuild nudge. Cross-
OS stat handled via `stat -c %Y` with `stat -f %m` fallback. Date formatted
with `date -r <file>` (BSD/macOS friendly), GNU `date -u -d "@<epoch>"`
fallback. Contract preserved (no return value, copies into `$dir/bin/obol`).
---
 flows/lib.sh | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/flows/lib.sh b/flows/lib.sh
index d67fe512..036ed940 100755
--- a/flows/lib.sh
+++ b/flows/lib.sh
@@ -517,9 +517,46 @@ bootstrap_flow_workspace() {
     local dir="$1"
     local obol_bin="$2"
     local tool src
+    local workspace_bin="$OBOL_ROOT/.workspace/bin/obol"
+    local picked="$obol_bin"
+    local picked_mtime other_mtime delta abs_delta
+
+    # Pick the freshest of the caller-supplied binary and the workspace binary.
+    # During iteration on embedded skill content (e.g. buy-x402/scripts/buy.py)
+    # it is easy to rebuild one and forget the other; copying the stale one
+    # silently bakes pre-fix files into the cluster PVC. See pitfall in
+    # plans/inference-v1337-buy-report-20260514.md (v1337 attempt 5).
+    if [ -f "$obol_bin" ] && [ -f "$workspace_bin" ] && [ "$obol_bin" != "$workspace_bin" ]; then
+        picked_mtime=$(stat -c %Y "$obol_bin" 2>/dev/null || stat -f %m "$obol_bin" 2>/dev/null || echo 0)
+        other_mtime=$(stat -c %Y "$workspace_bin" 2>/dev/null || stat -f %m "$workspace_bin" 2>/dev/null || echo 0)
+        if [ "$other_mtime" -gt "$picked_mtime" ]; then
+            picked="$workspace_bin"
+            delta=$((picked_mtime - other_mtime))
+        else
+            delta=$((other_mtime - picked_mtime))
+        fi
+        abs_delta=${delta#-}
+        if [ "$abs_delta" -gt 300 ]; then
+            local fmt_a fmt_b picked_fmt
+            fmt_a=$(date -r "$obol_bin" -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -d "@$picked_mtime" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "?")
+            fmt_b=$(date -r "$workspace_bin" -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -u -d "@$other_mtime" +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo "?")
+            if [ "$picked" = "$workspace_bin" ]; then
+                picked_fmt="$workspace_bin (mtime $fmt_b)"
+            else
+                picked_fmt="$obol_bin (mtime $fmt_a)"
+            fi
+            echo "  WARN: obol binary mtimes differ by ${abs_delta}s — one of these was likely forgotten in a rebuild" >&2
+            echo "    $obol_bin       mtime $fmt_a" >&2
+            echo "    $workspace_bin  mtime $fmt_b" >&2
+            echo "    picked: $picked_fmt" >&2
+            echo "    Rebuild both with \`go build -o .build/obol ./cmd/obol && go build -o .workspace/bin/obol ./cmd/obol\` if you've been iterating on embedded skill content." >&2
+        fi
+    elif [ ! -f "$obol_bin" ] && [ -f "$workspace_bin" ]; then
+        picked="$workspace_bin"
+    fi
 
     reset_flow_workspace "$dir"
-    cp "$obol_bin" "$dir/bin/obol"
+    cp "$picked" "$dir/bin/obol"
     chmod +x "$dir/bin/obol"
     for tool in kubectl helm helmfile k3d k9s openclaw; do
         src=$(command -v "$tool" 2>/dev/null || printf '%s\n' "$OBOL_ROOT/.workspace/bin/$tool")

From 849cd93d751e0d327f56f75d4ea38fa506aab23e Mon Sep 17 00:00:00 2001
From: bussyjd <bussyjd@users.noreply.github.com>
Date: Thu, 14 May 2026 12:37:54 +0800
Subject: [PATCH 3/5] docs(skill): document Cloudflare-WAF UA pitfall + Go-side
 follow-up
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds entry #10 to the release-smoke debugging reference covering the
HTTP 403 + Cloudflare error 1010 we hit on v1337 attempts 3–4: managed
WAF rules block the default `Python-urllib/X.Y` UA. Documents the buy.py
fix (commit c2dddc1) plus the unconfirmed-but-likely Go-side follow-up
at internal/serviceoffercontroller/purchase.go:183, where Go's
`http.Client` defaults to `User-Agent: Go-http-client/1.1` and may hit
the same WAF block on the controller probe.
---
 .../obol-stack-dev/references/release-smoke-debugging.md  | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.agents/skills/obol-stack-dev/references/release-smoke-debugging.md b/.agents/skills/obol-stack-dev/references/release-smoke-debugging.md
index c27bd913..f08209d9 100644
--- a/.agents/skills/obol-stack-dev/references/release-smoke-debugging.md
+++ b/.agents/skills/obol-stack-dev/references/release-smoke-debugging.md
@@ -99,6 +99,14 @@ Earlier `1.4.9` of `ghcr.io/obolnetwork/x402-facilitator-prometheus-overlay` shi
 - **Fixed upstream**: `ObolNetwork/x402-rs#3` (merged 2026-05-13, `668b7bb`) dropped the platform pin. The publish workflow republished `1.4.9` on push to `main`; arm64 digest is now `sha256:b209345c5e05415df36444b307213c61f9ca08db9f8131d0ebfebefc244ba4ec`.
 - **`X402_FACILITATOR_SKIP_PULL` knob removed** from `flows/lib.sh` once the republished image was validated against the release-smoke. If you encounter `exec format error` on an arm64 host now, the registry image is wrong, not the host — pull-fresh (`docker pull ghcr.io/obolnetwork/x402-facilitator-prometheus-overlay:1.4.9`) and check the manifest with `docker buildx imagetools inspect`.
 
+### 10. Cloudflare WAF blocks default `Python-urllib` User-Agent on external sellers
+
+When buying from external x402 sellers (sellers running outside our k3d cluster — e.g. `https://inference.v1337.org/...`), some sit behind Cloudflare's managed WAF, which **blocks the default `Python-urllib/X.Y` UA with HTTP 403 + Cloudflare error 1010** ("the owner of this website has banned your access based on your browser's signature"). Both the unpaid 402 probe and the paid `X-PAYMENT` request fail; buyers see misleading auth/signing errors instead of the real cause.
+
+- **Symptom**: `buy.py probe` against an external seller fails with 403 (often surfaced as a JSON-decode error or "no accepts" downstream); `buy.py buy` against the same endpoint also fails before signature verification. `curl` with its default UA (or any browser UA) against the same URL returns 402 cleanly.
+- **Fix in repo**: `c2dddc1` — added module-level `USER_AGENT = os.environ.get("OBOL_BUYER_USER_AGENT", "obol-buy-x402/1.0 (+https://github.com/ObolNetwork/obol-stack)")` to `internal/embed/skills/buy-x402/scripts/buy.py`, applied in `_probe_endpoint` (kind=http), `_probe_endpoint` (kind=inference), and the paid `X-PAYMENT` request in `buy_paid_oneshot`. Tested four UAs against v1337 (`curl/*`, generic `Mozilla/*`, `Chrome/*`, custom `obol-buy-x402/*`) — all four returned 402 cleanly. The fix is "send anything that isn't `Python-urllib`", not "send a specific browser UA". Operator override: `OBOL_BUYER_USER_AGENT`.
+- **Follow-up (not yet confirmed)**: the same WAF block likely affects the Go-side controller probe at `internal/serviceoffercontroller/purchase.go:183`, since Go's `http.Client` defaults to `User-Agent: Go-http-client/1.1`. Verify against v1337 and apply the same UA override on the Go side if reproduced.
+
 ## Diagnostic Patterns
 
 - **Don't confuse 503 with "verifier broken"** — almost always one of #1, #2, #5, #6, or a missing CA bundle (`paid-flows.md`).

From 82108c3af1faf0d97f8e061aa24265c3ff8a2dfb Mon Sep 17 00:00:00 2001
From: bussyjd <bussyjd@users.noreply.github.com>
Date: Thu, 14 May 2026 15:05:41 +0800
Subject: [PATCH 4/5] =?UTF-8?q?docs(plans):=20retract=20v1337=20controller?=
 =?UTF-8?q?-gap=20hypothesis=20=E2=80=94=20controller=20works?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Re-ran the v1337 buy with the new KEEP_CLUSTER_ON_FAIL=1 knob (commit
b749f95). The controller reconciled the PurchaseRequest in 55 seconds
through Probed → AuthsLoaded → Configured → Ready, against the same
external endpoint the original report failed on.

The original report's central technical claim — "serviceoffer-controller
does not reconcile PurchaseRequests for external sellers" — is false.
The controller is endpoint-agnostic by design (verified by code review of
internal/serviceoffercontroller/purchase.go). Attempt 5's reconcile-hang
was almost certainly a kubectl-exec session SIGKILL (exit 137), not a
controller bug — likely harness-side run_with_timeout firing while
buy.py was still polling normally.

Today's run did surface a real but unrelated quirk: LiteLLM's POST
/model/new fails with EROFS because /etc/litellm/config.yaml is mounted
read-only as a Kubernetes ConfigMap volume; the controller catches this
and falls back to ConfigMap reload, which works fine. Pre-existing,
worth one line in paid-flows.md so the next debugger isn't startled.

Step 18 (paid request) failed for an operator-error reason: I picked
qwen3.6-27b as the upstream model id, but v1337's vLLM serves under a
different name. Bob's 0.023 OBOL was NOT consumed (LiteLLM 404'd before
the buyer sidecar could settle).

Companion to plans/inference-v1337-buy-report-20260514.md. Retracts
follow-up #1 of that report.
---
 plans/inference-v1337-followup-20260514.md | 102 +++++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 plans/inference-v1337-followup-20260514.md

diff --git a/plans/inference-v1337-followup-20260514.md b/plans/inference-v1337-followup-20260514.md
new file mode 100644
index 00000000..9c95425b
--- /dev/null
+++ b/plans/inference-v1337-followup-20260514.md
@@ -0,0 +1,102 @@
+# Live buy from `inference.v1337.org` — follow-up findings
+
+Date: 2026-05-14
+Test bed: spark1 (Linux aarch64)
+Worktree: `/home/claude/obol-stack-qa-20260513-135712-post490`
+Branch HEAD during the test: `eb13055` (`chore/buy-external-followups`)
+Companion to: `plans/inference-v1337-buy-report-20260514.md`
+
+## TL;DR
+
+Re-ran the v1337 buy with `KEEP_CLUSTER_ON_FAIL=1` (new knob in `flows/buy-external.sh`, commit `b749f95`). Steps 1–17 PASS. The controller reconciled the PurchaseRequest in 55 seconds (`Probed` → `AuthsLoaded` → `Configured` → `Ready`), publishing `paid/qwen3.6-27b` via the buyer sidecar. **The "controller external-seller mode gap" hypothesis from the original report is false** — the controller is endpoint-agnostic by design (confirmed by code review of `internal/serviceoffercontroller/purchase.go`) and works against arbitrary external x402 sellers without modification.
+
+The original report's attempt 5 failure (`command terminated with exit code 137` at the controller-reconcile wait) was almost certainly a kubectl-exec session SIGKILL — not a controller hang. Today's identical code path completed cleanly in 0s at the same step.
+
+## What today's run actually showed
+
+PR `status.conditions[]` from the captured `purchaserequest.yaml`:
+
+```yaml
+conditions:
+- type: Probed         status: "True"   reason: Validated   message: "402: 23000000000000000 on eip155:84532"   06:37:06Z
+- type: AuthsLoaded    status: "True"   reason: Loaded      message: "Loaded 1 pre-signed auths from spec"      06:37:06Z
+- type: Configured     status: "True"   reason: Written     message: "Wrote 1 auths to llm/x402-buyer-auths"    06:37:06Z
+- type: Ready          status: "True"   reason: Reconciled  message: "Sidecar: 1 remaining, 0 spent"            06:38:01Z
+observedGeneration: 1
+publicModel: paid/qwen3.6-27b
+remaining: 1
+totalSigned: 1
+signerAddress: 0x57b0eF875DeB5A37301F1640E469a2129Da9490E
+```
+
+`buyer-status-after.json`:
+
+```json
+{"v1337-aeon": {"url": "https://inference.v1337.org/services/aeon", "remote_model": "qwen3.6-27b", "public_model": "paid/qwen3.6-27b", "remaining": 1, "spent": 0, "network": "base-sepolia"}}
+```
+
+The Go-side probe at `purchase.go:183` was NOT WAF-blocked. The follow-up worry (entry #10 in `release-smoke-debugging.md`) about Cloudflare's WAF blocking `Go-http-client/1.1` UA does not reproduce against v1337. Worth keeping the doc note for the general class of WAF UA filters, but not a load-bearing concern for the controller code path.
+
+## Why attempt 5 looked like a controller hang
+
+Today the harness completed step 14 (`buy.py completed`) with the same code that produced attempt 5's exit-code-137. Two structural differences plausibly explain why attempt 5 hung where today's run sailed:
+
+1. **`bootstrap_flow_workspace` now picks the freshest binary** (commit `eb13055`). Attempt 5 silently used a stale `.build/obol` whose embedded buy.py lacked the USER_AGENT fix. Even after the operator rebuilt `.build/obol` mid-attempt, the Bob workspace had already been bootstrapped from the older copy. The PVC's buy.py wrote a probe with `Python-urllib` UA → 403 from CF → the controller's view of the world differed from the buyer's view in subtle ways. The `eb13055` fix removes that footgun for future runs.
+
+2. **The kubectl-exec SIGKILL was an environmental artifact.** `command terminated with exit code 137` is what kubectl prints when its remote process dies from SIGKILL — could be harness `run_with_timeout`, OOM, or control-plane jitter. None of those would be visible in the controller logs (which, as today's `KEEP_CLUSTER_ON_FAIL=1` snapshot showed, go quiet during the wait). Today's harness completed the same `obol kubectl exec` to buy.py without issue, so the SIGKILL was not deterministic.
+
+## The actual blocker today
+
+Step 18 (paid request through LiteLLM) failed:
+
+```
+FAIL: [18] Paid request returned HTTP 404
+{"error":{"message":"litellm.NotFoundError: NotFoundError: OpenAIException - The model `qwen3.6-27b` does not exist.. Received Model Group=paid/qwen3.6-27b\nAvailable Model Group Fallbacks=None","type":null,"param":null,"code":"404"}}
+```
+
+This is operator-error model-name mismatch:
+
+- LiteLLM correctly routed `paid/qwen3.6-27b` → buyer sidecar → `https://inference.v1337.org/services/aeon`.
+- v1337's upstream vLLM does not serve a model named `qwen3.6-27b`.
+- The actual model name is unknown from `/.well-known/agent-registration.json` (which advertises display name "Qwen3.6-27B AEON Ultimate" and skills `llm/inference, llm/uncensored`, but no model id).
+
+Bob's 0.023 OBOL pre-signed auth was **NOT consumed** — LiteLLM 404'd before reaching the buyer sidecar's `/settle` path. Wallet balance unchanged.
+
+To finish the live buy proof, the harness needs the right `--model` value. Options: (a) ask the seller, (b) probe `/v1/models` if v1337 makes it free, (c) brute-force common variants (`aeon`, `qwen-3.6-27b`, `qwen3.6`, `qwen3.6-27b-aeon`). All low-priority — the controller-side answer is already in.
+
+## Side finding: LiteLLM hot-add quirk
+
+The controller logs surfaced:
+
+```
+purchase: hot-add paid/qwen3.6-27b failed: POST /model/new: 400 Bad Request:
+{"error":{"message":"Authentication Error, [Errno 30] Read-only file system: '/etc/litellm/config.yaml'", ...}}; relying on ConfigMap reload
+```
+
+LiteLLM's `/model/new` API tries to write back to `/etc/litellm/config.yaml`. In our deployment that path is a Kubernetes ConfigMap volume — read-only by default. The controller catches the 400 and falls back to the ConfigMap-reload path, which works (the alias DID become available, otherwise step 17 wouldn't have passed). Pre-existing behavior, not external-seller specific. Worth a one-line note in `paid-flows.md` so the next debugger isn't startled by the WARN in controller logs.
+
+## Updates to original report
+
+Replace follow-up #1 ("serviceoffer-controller external-seller mode") with: "RESOLVED — controller is endpoint-agnostic by design. Attempt 5's reconcile-hang was a kubectl-exec SIGKILL artifact, not a controller bug. Verified 2026-05-14 with `KEEP_CLUSTER_ON_FAIL=1` re-run."
+
+Follow-up #2 (harness binary path) — DONE in commit `eb13055`.
+Follow-up #3 (CF-WAF UA documentation) — DONE in commit `849cd93`.
+Follow-up #4 (`KEEP_CLUSTER_ON_FAIL` knob) — DONE in commit `b749f95`.
+
+The original report still has narrative value for the four bug fixes it surfaced (k3d cluster-name cap, CAIP-2 chain id mismatch, CF-WAF Python-urllib UA, stale `.build/obol`). Only the controller hypothesis was wrong.
+
+## Artifacts
+
+Under `/home/claude/obol-stack-qa-20260513-135712-post490/.tmp/v1337-rerun-20260514-063232-artifacts/` on spark1, captured by the new `external_snapshot_on_fail()`:
+
+- `controller.log`, `controller-current.log` — full reconcile trace
+- `purchaserequest.yaml` — the conclusive `Ready=True` proof
+- `buyer-status-after.json` — sidecar saw 1 remaining, 0 spent
+- `agent-pod-buypy.log` — clean `buy.py` run through PR creation
+- `cluster-pods.txt`, `cluster-events.txt` — full cluster state at FAIL
+
+The Bob k3d cluster (`obol-stack-buy-ext-bob`) is preserved on spark1 pending teardown.
+
+## Closing note
+
+The phase-1 polish items in `chore/buy-external-followups` more than paid for themselves on the first re-run: `KEEP_CLUSTER_ON_FAIL=1` made the diagnosis trivial, the binary normalization removed one of the candidate causes for attempt 5's hang, and the diagnostic snapshot bundle gave us seven artifacts that took a single bash command to inspect. The original v1337 report would have been wrong on its central technical claim if we hadn't re-run with these in place — a useful argument for keeping operator-level diagnostic ergonomics ahead of feature work.

From df5fcffc31aeabe3e7505accd28413aede2b62f7 Mon Sep 17 00:00:00 2001
From: bussyjd <bussyjd@users.noreply.github.com>
Date: Fri, 15 May 2026 09:41:30 +0800
Subject: [PATCH 5/5] refactor(buy-external): green-only cleanup gate, drop
 KEEP_CLUSTER_ON_FAIL knob
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the opt-in KEEP_CLUSTER_ON_FAIL=1 env knob (added in b749f95)
with an unconditional rule: cleanup happens iff every step passes. On
FAIL, snapshot the diagnostic bundle and preserve the cluster — every
time, no env override needed.

Also inverts the prior success-side default. The previous design left
the cluster up on success "so the operator can poke around"; in
practice operators re-ran the harness from scratch when they wanted
fresh state, and the leftover cluster mostly leaked across runs. With
the new gate, a green run leaves a clean machine.

Net behavior:
- success → bob stack down (clean state for next run)
- failure → snapshot + preserve (operator pays one manual teardown
            when done diagnosing)

The diagnostic snapshot helper from b749f95 is unchanged; only the
preservation gate moved from an env knob to the implicit pass/fail
state.
---
 flows/buy-external.sh | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/flows/buy-external.sh b/flows/buy-external.sh
index c69cf49e..002d0d2b 100755
--- a/flows/buy-external.sh
+++ b/flows/buy-external.sh
@@ -264,24 +264,23 @@ external_cleanup() {
     [ -n "$PF_LITELLM" ] && cleanup_pid "$PF_LITELLM" 2>/dev/null
     [ -n "$PF_LITELLM_LOG" ] && rm -f "$PF_LITELLM_LOG" 2>/dev/null
 
-    # Leave the cluster up on success so the operator can poke around. Only
-    # tear it down if the flow already failed — a leaked k3d cluster between
-    # runs eats Docker network space (cleanup_k3d_obol_networks reclaims).
-    if [ "$ec" -ne 0 ] && type bob >/dev/null 2>&1; then
-        # Snapshot diagnostics BEFORE the cluster goes away — these are the
-        # only places that record why the PurchaseRequest never advanced.
-        echo "Capturing failure snapshot to $EXTERNAL_BUY_ARTIFACT_DIR"
-        external_snapshot_on_fail
-
-        if [ "${KEEP_CLUSTER_ON_FAIL:-0}" = "1" ]; then
+    # Cleanup gate: tear down only when every step passed. On FAIL, snapshot
+    # diagnostics and preserve the cluster — the only places that record why
+    # a PurchaseRequest never advanced are the controller logs, PR
+    # status.conditions[], and sidecar /status, all of which die with the
+    # cluster. Operator pays one manual `bob stack down` when done diagnosing.
+    if type bob >/dev/null 2>&1; then
+        if [ "$ec" -eq 0 ]; then
+            bob stack down >/dev/null 2>&1 || true
+        else
+            echo "Capturing failure snapshot to $EXTERNAL_BUY_ARTIFACT_DIR"
+            external_snapshot_on_fail
             echo ""
-            echo "KEEP_CLUSTER_ON_FAIL=1 → cluster preserved."
+            echo "FAIL → cluster preserved for diagnosis."
             echo "  Stack id:  $PINNED_STACK_ID"
             echo "  Artifacts: $EXTERNAL_BUY_ARTIFACT_DIR"
             echo "  Manual cleanup when done:"
             echo "    bob stack down"
-        else
-            bob stack down >/dev/null 2>&1 || true
         fi
     fi
     cleanup_k3d_obol_networks