outsourc-e · GO1984 · Jun 1, 2026 · Jun 1, 2026
diff --git a/README.md b/README.md
@@ -148,6 +148,17 @@ benchloop run --model your-model --provider openai_compat --endpoint http://your
 
 The CLI flag takes precedence over the env var. For Ollama and local providers without auth, neither is needed.
 
+When you use multiple OpenAI-compatible endpoints, set per-endpoint keys with
+`BENCHLOOP_OPENAI_KEYS`. Entries are comma-separated `endpoint=key` pairs; each
+endpoint must equal the base URL passed as the endpoint (trailing slashes are ignored):
+
+```bash
+export BENCHLOOP_OPENAI_KEYS="http://127.0.0.1:8000=sk-local,https://openrouter.ai/api=sk-or-..."
+```
+
+BenchLoop uses the matching endpoint-specific key first, then falls back to
+`OPENAI_API_KEY`.
+
 ### Launch the local dashboard
 
 v0.2.0+ ships the full FastAPI + React dashboard inside the wheel. After `pipx install benchloop-cli`:

diff --git a/bench_loop/dashboard/api/routes/benchmark.py b/bench_loop/dashboard/api/routes/benchmark.py
@@ -352,7 +352,9 @@ async def get_run(run_id: str):
     """Get detailed run result."""
     # Check active runs first
     if run_id in _active_runs:
-        return _active_runs[run_id]
+        state = dict(_active_runs[run_id])
+        state.pop("task", None)
+        return state
 
     # Check disk
     run_dir = RUNS_DIR / run_id

diff --git a/bench_loop/dashboard/api/routes/chat.py b/bench_loop/dashboard/api/routes/chat.py
@@ -8,6 +8,8 @@
 from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
 
+from bench_loop.providers.openai_compat import _auth_headers as _openai_headers
+
 router = APIRouter()
 
 
@@ -61,6 +63,7 @@ async def chat_generate(req: ChatRequest):
                 resp = await client.post(
                     f"{req.endpoint.rstrip('/')}/v1/chat/completions",
                     json=payload,
+                    headers=_openai_headers(req.endpoint),
                 )
                 resp.raise_for_status()
                 data = resp.json()

diff --git a/bench_loop/dashboard/api/routes/models.py b/bench_loop/dashboard/api/routes/models.py
@@ -11,6 +11,8 @@
 from pydantic import BaseModel
 import httpx
 
+from bench_loop.providers.openai_compat import _auth_headers as _openai_headers
+
 try:
     from sse_starlette.sse import EventSourceResponse
 except Exception:  # pragma: no cover
@@ -195,7 +197,7 @@ async def _fetch_ollama_models(endpoint: str) -> list[dict]:
 async def _fetch_openai_models(endpoint: str) -> list[dict]:
     """Fetch models from an OpenAI-compatible endpoint."""
     async with httpx.AsyncClient(timeout=5) as client:
-        resp = await client.get(f"{endpoint.rstrip('/')}/v1/models")
+        resp = await client.get(f"{endpoint.rstrip('/')}/v1/models", headers=_openai_headers(endpoint))
         resp.raise_for_status()
     raw = resp.json().get("data", [])
     return [
@@ -232,20 +234,27 @@ async def _probe_provider(provider: dict) -> dict | None:
         return None
 
 
-_OPENAI_HINT_PORTS = {1234, 1337, 5001, 8000, 8080, 8081}
+_OPENAI_HINT_PORTS = {1234, 1337, 5001, 8000, 8080, 8081, 8088, 10531, 11451}
+_OPENAI_HINT_HOSTS = {"api.openai.com", "openrouter.ai"}
+
+
+def _is_openai_compat_endpoint(endpoint: str) -> bool:
+    """Return True if the endpoint's port or hostname is a known OpenAI-compatible hint (False if the URL is malformed)."""
+    from urllib.parse import urlparse
+    try:
+        parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
+        port, hostname = parsed.port, (parsed.hostname or "").lower()  # `.port` raises ValueError on a bad port
+    except ValueError:
+        return False
+    return port in _OPENAI_HINT_PORTS or hostname in _OPENAI_HINT_HOSTS
 
 
 @router.get("/models")
 async def list_models(endpoint: str = Query(default="")):
     """List models. If endpoint specified, query that. Otherwise auto-detect local providers."""
     if endpoint:
         # If the port is well-known for OpenAI-compatible servers, try that first.
-        from urllib.parse import urlparse
-        try:
-            port = urlparse(endpoint).port
-        except Exception:
-            port = None
-        order: list[str] = ["openai", "ollama"] if port in _OPENAI_HINT_PORTS else ["ollama", "openai"]
+        order: list[str] = ["openai", "ollama"] if _is_openai_compat_endpoint(endpoint) else ["ollama", "openai"]
         last_error: str | None = None
         for kind in order:
             try:
@@ -307,39 +316,68 @@ async def preflight_model(
     """
     ep = endpoint.rstrip("/")
 
-    # Version check against our support table (fast path, no model load needed)
-    version = await _fetch_ollama_version(ep)
-    installed_tuple = _version_tuple(version) if version else (0,)
-    # We don't have the details dict for an arbitrary model name — do a best
-    # effort classification from the name alone.
-    support = _classify_model_support(model, {"family": "", "quantization_level": ""})
-    required = support["required_version"]
-    if required and installed_tuple < _version_tuple(required):
+    is_openai_compat = _is_openai_compat_endpoint(ep)
+
+    # Ollama version checks only apply to Ollama endpoints. OpenAI-compatible
+    # servers such as llama.cpp/vLLM do not expose /api/version and should not
+    # be blocked by Ollama architecture/version rules.
+    version = None
+    if not is_openai_compat:
+        version = await _fetch_ollama_version(ep)
+        installed_tuple = _version_tuple(version) if version else (0,)
+        # We don't have the details dict for an arbitrary model name — do a best
+        # effort classification from the name alone.
+        support = _classify_model_support(model, {"family": "", "quantization_level": ""})
+        required = support["required_version"]
+        if required and installed_tuple < _version_tuple(required):
+            return {
+                "ok": False,
+                "reason": "version_mismatch",
+                "message": (
+                    f"Model `{model}` needs Ollama {required}+ ({support['reason']}). "
+                    f"Installed: {version or 'unknown'}. Upgrade with "
+                    f"`brew upgrade ollama` then restart `ollama serve`."
+                ),
+                "required_version": required,
+                "provider_version": version,
+                "raw": None,
+            }
+
+    # Actual load test: minimum-cost chat round-trip
+    try:
+        async with httpx.AsyncClient(timeout=15) as client:
+            if is_openai_compat:
+                resp = await client.post(
+                    f"{ep}/v1/chat/completions",
+                    json={
+                        "model": model,
+                        "messages": [{"role": "user", "content": "ok"}],
+                        "stream": False,
+                        "max_tokens": 16,
+                    },
+                    headers=_openai_headers(ep),
+                )
+            else:
+                resp = await client.post(
+                    f"{ep}/api/chat",
+                    json={
+                        "model": model,
+                        "messages": [{"role": "user", "content": "ok"}],
+                        "stream": False,
+                        "options": {"num_predict": 1},
+                    },
+                )
+    except httpx.ReadTimeout as exc:
         return {
-            "ok": False,
-            "reason": "version_mismatch",
+            "ok": True,
+            "reason": "load_timeout",
             "message": (
-                f"Model `{model}` needs Ollama {required}+ ({support['reason']}). "
-                f"Installed: {version or 'unknown'}. Upgrade with "
-                f"`brew upgrade ollama` then restart `ollama serve`."
+                f"Model `{model}` did not answer within the quick preflight window. "
+                "Continuing with the benchmark so large models can finish loading normally."
             ),
-            "required_version": required,
+            "raw": str(exc),
             "provider_version": version,
-            "raw": None,
         }
-
-    # Actual load test: minimum-cost chat round-trip
-    try:
-        async with httpx.AsyncClient(timeout=15) as client:
-            resp = await client.post(
-                f"{ep}/api/chat",
-                json={
-                    "model": model,
-                    "messages": [{"role": "user", "content": "ok"}],
-                    "stream": False,
-                    "options": {"num_predict": 1},
-                },
-            )
     except httpx.HTTPError as exc:
         return {
             "ok": False,
@@ -386,6 +424,14 @@ async def preflight_model(
         }
 
     if "model" in lower and "not found" in lower:
+        if is_openai_compat:
+            return {
+                "ok": False,
+                "reason": "not_found",
+                "message": f"`{model}` is not available at {ep}. Check the model id from `/v1/models`.",
+                "raw": raw,
+                "provider_version": version,
+            }
         return {
             "ok": False,
             "reason": "not_found",

diff --git a/bench_loop/hardware.py b/bench_loop/hardware.py
@@ -92,6 +92,16 @@ def _env_hardware_overrides() -> dict[str, object]:
     return overrides
 
 
+def _parse_nvidia_float(value: str) -> float | None:
+    """Return *value* as a float, or None when it is blank, "[N/A]" (any case), or not numeric."""
+    text = value.strip()
+    try:
+        # Blank and "[N/A]" fields short-circuit to None before float() is attempted.
+        return float(text) if text and text.upper() != "[N/A]" else None
+    except ValueError:
+        return None
+
+
 def _detect_gpu() -> dict[str, object]:
     overrides = _env_hardware_overrides()
     if overrides.get("gpu") or overrides.get("hardware_label"):
@@ -133,8 +143,9 @@ def _detect_gpu() -> dict[str, object]:
         if len(parts) < 3:
             continue
         name, memory_mb, temp_c = parts[0], parts[1], parts[2]
-        memory_gb = float(memory_mb) / 1024 if memory_mb else 0.0
-        temperature = float(temp_c) if temp_c else None
+        memory_value = _parse_nvidia_float(memory_mb)
+        temperature = _parse_nvidia_float(temp_c)
+        memory_gb = memory_value / 1024 if memory_value is not None else 0.0
         details.append({"name": name, "memory_gb": memory_gb, "temperature_c": temperature})
         names.append(name)
         total_memory_gb += memory_gb

diff --git a/bench_loop/providers/openai_compat.py b/bench_loop/providers/openai_compat.py
@@ -15,9 +15,20 @@
 _STREAM_TIMEOUT = httpx.Timeout(connect=15.0, read=600.0, write=60.0, pool=60.0)
 
 
-def _auth_headers() -> dict[str, str]:
+def _api_key_for_endpoint(endpoint: str) -> str:
+    """Return the BENCHLOOP_OPENAI_KEYS key for *endpoint*, else OPENAI_API_KEY (or "")."""
+    target = endpoint.rstrip("/")
+    for entry in os.getenv("BENCHLOOP_OPENAI_KEYS", "").strip().split(","):
+        base_url, found, api_key = entry.partition("=")
+        # Entries without "=" are skipped; trailing slashes are ignored on both sides.
+        if found and base_url.strip().rstrip("/") == target:
+            return api_key.strip()
+    return os.getenv("OPENAI_API_KEY", "").strip()
+
+
+def _auth_headers(endpoint: str) -> dict[str, str]:
     headers = {"Content-Type": "application/json"}
-    api_key = os.getenv("OPENAI_API_KEY", "")
+    api_key = _api_key_for_endpoint(endpoint)
     if api_key:
         headers["Authorization"] = f"Bearer {api_key}"
     return headers
@@ -27,7 +38,7 @@ async def list_models(endpoint: str) -> list[str]:
     base_url = endpoint.rstrip("/")
     try:
         async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT) as client:
-            response = await client.get(f"{base_url}/v1/models", headers=_auth_headers())
+            response = await client.get(f"{base_url}/v1/models", headers=_auth_headers(endpoint))
             response.raise_for_status()
     except Exception:
         return []
@@ -63,7 +74,7 @@ async def chat(endpoint: str, model: str, messages: list[dict[str, Any]], **kwar
             response = await client.post(
                 f"{base_url}/v1/chat/completions",
                 json=payload,
-                headers=_auth_headers(),
+                headers=_auth_headers(endpoint),
             )
             response.raise_for_status()
     except Exception as exc:
@@ -168,7 +179,7 @@ async def chat_streaming(
                 "POST",
                 f"{base_url}/v1/chat/completions",
                 json=payload,
-                headers=_auth_headers(),
+                headers=_auth_headers(endpoint),
             ) as response:
                 response.raise_for_status()
                 async for line in response.aiter_lines():