diff --git a/README.md b/README.md index f9cdc39..c1826e7 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,17 @@ benchloop run --model your-model --provider openai_compat --endpoint http://your The CLI flag takes precedence over the env var. For Ollama and local providers without auth, neither is needed. +When you use multiple OpenAI-compatible endpoints, set per-endpoint keys with +`BENCHLOOP_OPENAI_KEYS`. Entries are comma-separated `endpoint=key` pairs and +the endpoint must match the base URL without a trailing slash: + +```bash +export BENCHLOOP_OPENAI_KEYS="http://127.0.0.1:8000=sk-local,https://openrouter.ai/api=sk-or-..." +``` + +BenchLoop uses the matching endpoint-specific key first, then falls back to +`OPENAI_API_KEY`. + ### Launch the local dashboard v0.2.0+ ships the full FastAPI + React dashboard inside the wheel. After `pipx install benchloop-cli`: diff --git a/bench_loop/dashboard/api/routes/benchmark.py b/bench_loop/dashboard/api/routes/benchmark.py index 28385db..95c270e 100644 --- a/bench_loop/dashboard/api/routes/benchmark.py +++ b/bench_loop/dashboard/api/routes/benchmark.py @@ -352,7 +352,9 @@ async def get_run(run_id: str): """Get detailed run result.""" # Check active runs first if run_id in _active_runs: - return _active_runs[run_id] + state = dict(_active_runs[run_id]) + state.pop("task", None) + return state # Check disk run_dir = RUNS_DIR / run_id diff --git a/bench_loop/dashboard/api/routes/chat.py b/bench_loop/dashboard/api/routes/chat.py index cb8645e..afa53dd 100644 --- a/bench_loop/dashboard/api/routes/chat.py +++ b/bench_loop/dashboard/api/routes/chat.py @@ -8,6 +8,8 @@ from fastapi import APIRouter, HTTPException from pydantic import BaseModel +from bench_loop.providers.openai_compat import _auth_headers as _openai_headers + router = APIRouter() @@ -61,6 +63,7 @@ async def chat_generate(req: ChatRequest): resp = await client.post( f"{req.endpoint.rstrip('/')}/v1/chat/completions", json=payload, + headers=_openai_headers(req.endpoint), ) resp.raise_for_status() data = resp.json() diff --git a/bench_loop/dashboard/api/routes/models.py b/bench_loop/dashboard/api/routes/models.py index ab32081..e09a6d2 100644 --- a/bench_loop/dashboard/api/routes/models.py +++ b/bench_loop/dashboard/api/routes/models.py @@ -11,6 +11,8 @@ from pydantic import BaseModel import httpx +from bench_loop.providers.openai_compat import _auth_headers as _openai_headers + try: from sse_starlette.sse import EventSourceResponse except Exception: # pragma: no cover @@ -195,7 +197,7 @@ async def _fetch_ollama_models(endpoint: str) -> list[dict]: async def _fetch_openai_models(endpoint: str) -> list[dict]: """Fetch models from an OpenAI-compatible endpoint.""" async with httpx.AsyncClient(timeout=5) as client: - resp = await client.get(f"{endpoint.rstrip('/')}/v1/models") + resp = await client.get(f"{endpoint.rstrip('/')}/v1/models", headers=_openai_headers(endpoint)) resp.raise_for_status() raw = resp.json().get("data", []) return [ @@ -232,7 +234,19 @@ async def _probe_provider(provider: dict) -> dict | None: return None -_OPENAI_HINT_PORTS = {1234, 1337, 5001, 8000, 8080, 8081} +_OPENAI_HINT_PORTS = {1234, 1337, 5001, 8000, 8080, 8081, 8088, 10531, 11451} +_OPENAI_HINT_HOSTS = {"api.openai.com", "openrouter.ai"} + + +def _is_openai_compat_endpoint(endpoint: str) -> bool: + from urllib.parse import urlparse + + try: + parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}") + except Exception: + return False + hostname = (parsed.hostname or "").lower() + return parsed.port in _OPENAI_HINT_PORTS or hostname in _OPENAI_HINT_HOSTS @router.get("/models") @@ -240,12 +254,7 @@ async def list_models(endpoint: str = Query(default="")): """List models. If endpoint specified, query that. Otherwise auto-detect local providers.""" if endpoint: # If the port is well-known for OpenAI-compatible servers, try that first. - from urllib.parse import urlparse - try: - port = urlparse(endpoint).port - except Exception: - port = None - order: list[str] = ["openai", "ollama"] if port in _OPENAI_HINT_PORTS else ["ollama", "openai"] + order: list[str] = ["openai", "ollama"] if _is_openai_compat_endpoint(endpoint) else ["ollama", "openai"] last_error: str | None = None for kind in order: try: @@ -307,39 +316,68 @@ async def preflight_model( """ ep = endpoint.rstrip("/") - # Version check against our support table (fast path, no model load needed) - version = await _fetch_ollama_version(ep) - installed_tuple = _version_tuple(version) if version else (0,) - # We don't have the details dict for an arbitrary model name — do a best - # effort classification from the name alone. - support = _classify_model_support(model, {"family": "", "quantization_level": ""}) - required = support["required_version"] - if required and installed_tuple < _version_tuple(required): + is_openai_compat = _is_openai_compat_endpoint(ep) + + # Ollama version checks only apply to Ollama endpoints. OpenAI-compatible + # servers such as llama.cpp/vLLM do not expose /api/version and should not + # be blocked by Ollama architecture/version rules. + version = None + if not is_openai_compat: + version = await _fetch_ollama_version(ep) + installed_tuple = _version_tuple(version) if version else (0,) + # We don't have the details dict for an arbitrary model name — do a best + # effort classification from the name alone. + support = _classify_model_support(model, {"family": "", "quantization_level": ""}) + required = support["required_version"] + if required and installed_tuple < _version_tuple(required): + return { + "ok": False, + "reason": "version_mismatch", + "message": ( + f"Model `{model}` needs Ollama {required}+ ({support['reason']}). " + f"Installed: {version or 'unknown'}. Upgrade with " + f"`brew upgrade ollama` then restart `ollama serve`." + ), + "required_version": required, + "provider_version": version, + "raw": None, + } + + # Actual load test: minimum-cost chat round-trip + try: + async with httpx.AsyncClient(timeout=15) as client: + if is_openai_compat: + resp = await client.post( + f"{ep}/v1/chat/completions", + json={ + "model": model, + "messages": [{"role": "user", "content": "ok"}], + "stream": False, + "max_tokens": 16, + }, + headers=_openai_headers(ep), + ) + else: + resp = await client.post( + f"{ep}/api/chat", + json={ + "model": model, + "messages": [{"role": "user", "content": "ok"}], + "stream": False, + "options": {"num_predict": 1}, + }, + ) + except httpx.ReadTimeout as exc: return { - "ok": False, - "reason": "version_mismatch", + "ok": True, + "reason": "load_timeout", "message": ( - f"Model `{model}` needs Ollama {required}+ ({support['reason']}). " - f"Installed: {version or 'unknown'}. Upgrade with " - f"`brew upgrade ollama` then restart `ollama serve`." + f"Model `{model}` did not answer within the quick preflight window. " + "Continuing with the benchmark so large models can finish loading normally." ), - "required_version": required, + "raw": str(exc), "provider_version": version, - "raw": None, } - - # Actual load test: minimum-cost chat round-trip - try: - async with httpx.AsyncClient(timeout=15) as client: - resp = await client.post( - f"{ep}/api/chat", - json={ - "model": model, - "messages": [{"role": "user", "content": "ok"}], - "stream": False, - "options": {"num_predict": 1}, - }, - ) except httpx.HTTPError as exc: return { "ok": False, @@ -386,6 +424,14 @@ async def preflight_model( } if "model" in lower and "not found" in lower: + if is_openai_compat: + return { + "ok": False, + "reason": "not_found", + "message": f"`{model}` is not available at {ep}. Check the model id from `/v1/models`.", + "raw": raw, + "provider_version": version, + } return { "ok": False, "reason": "not_found", diff --git a/bench_loop/hardware.py b/bench_loop/hardware.py index 26c05ec..e308e4f 100644 --- a/bench_loop/hardware.py +++ b/bench_loop/hardware.py @@ -92,6 +92,16 @@ def _env_hardware_overrides() -> dict[str, object]: return overrides +def _parse_nvidia_float(value: str) -> float | None: + cleaned = value.strip() + if not cleaned or cleaned.upper() == "[N/A]": + return None + try: + return float(cleaned) + except ValueError: + return None + + def _detect_gpu() -> dict[str, object]: overrides = _env_hardware_overrides() if overrides.get("gpu") or overrides.get("hardware_label"): @@ -133,8 +143,9 @@ def _detect_gpu() -> dict[str, object]: if len(parts) < 3: continue name, memory_mb, temp_c = parts[0], parts[1], parts[2] - memory_gb = float(memory_mb) / 1024 if memory_mb else 0.0 - temperature = float(temp_c) if temp_c else None + memory_value = _parse_nvidia_float(memory_mb) + temperature = _parse_nvidia_float(temp_c) + memory_gb = memory_value / 1024 if memory_value is not None else 0.0 details.append({"name": name, "memory_gb": memory_gb, "temperature_c": temperature}) names.append(name) total_memory_gb += memory_gb diff --git a/bench_loop/providers/openai_compat.py b/bench_loop/providers/openai_compat.py index 1fab53c..edab9b6 100644 --- a/bench_loop/providers/openai_compat.py +++ b/bench_loop/providers/openai_compat.py @@ -15,9 +15,20 @@ _STREAM_TIMEOUT = httpx.Timeout(connect=15.0, read=600.0, write=60.0, pool=60.0) -def _auth_headers() -> dict[str, str]: +def _api_key_for_endpoint(endpoint: str) -> str: + ep = endpoint.rstrip("/") + raw = os.getenv("BENCHLOOP_OPENAI_KEYS", "").strip() + if raw: + for chunk in raw.split(","): + url, sep, key = chunk.partition("=") + if sep and url.strip().rstrip("/") == ep: + return key.strip() + return os.getenv("OPENAI_API_KEY", "").strip() + + +def _auth_headers(endpoint: str) -> dict[str, str]: headers = {"Content-Type": "application/json"} - api_key = os.getenv("OPENAI_API_KEY", "") + api_key = _api_key_for_endpoint(endpoint) if api_key: headers["Authorization"] = f"Bearer {api_key}" return headers @@ -27,7 +38,7 @@ async def list_models(endpoint: str) -> list[str]: base_url = endpoint.rstrip("/") try: async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT) as client: - response = await client.get(f"{base_url}/v1/models", headers=_auth_headers()) + response = await client.get(f"{base_url}/v1/models", headers=_auth_headers(endpoint)) response.raise_for_status() except Exception: return [] @@ -63,7 +74,7 @@ async def chat(endpoint: str, model: str, messages: list[dict[str, Any]], **kwar response = await client.post( f"{base_url}/v1/chat/completions", json=payload, - headers=_auth_headers(), + headers=_auth_headers(endpoint), ) response.raise_for_status() except Exception as exc: @@ -168,7 +179,7 @@ async def chat_streaming( "POST", f"{base_url}/v1/chat/completions", json=payload, - headers=_auth_headers(), + headers=_auth_headers(endpoint), ) as response: response.raise_for_status() async for line in response.aiter_lines(): diff --git a/tests/test_openai_compat_endpoints.py b/tests/test_openai_compat_endpoints.py new file mode 100644 index 0000000..29bb768 --- /dev/null +++ b/tests/test_openai_compat_endpoints.py @@ -0,0 +1,111 @@ +from __future__ import annotations + +import pytest + +from bench_loop.dashboard.api.routes import chat as chat_routes +from bench_loop.dashboard.api.routes import models +from bench_loop.providers import openai_compat + + +def test_openai_endpoint_detection_by_port_and_host(): + assert models._is_openai_compat_endpoint("http://127.0.0.1:8088") + assert models._is_openai_compat_endpoint("http://localhost:11451") + assert models._is_openai_compat_endpoint("https://openrouter.ai/api") + assert models._is_openai_compat_endpoint("https://api.openai.com/v1") + assert not models._is_openai_compat_endpoint("http://localhost:11434") + + +def test_endpoint_specific_openai_key_takes_precedence(monkeypatch): + monkeypatch.setenv( + "BENCHLOOP_OPENAI_KEYS", + "http://127.0.0.1:8000=sk-local,https://openrouter.ai/api=sk-openrouter", + ) + monkeypatch.setenv("OPENAI_API_KEY", "sk-global") + + assert openai_compat._api_key_for_endpoint("http://127.0.0.1:8000/") == "sk-local" + assert openai_compat._api_key_for_endpoint("https://openrouter.ai/api") == "sk-openrouter" + assert openai_compat._api_key_for_endpoint("http://127.0.0.1:9000") == "sk-global" + + +@pytest.mark.asyncio +async def test_preflight_uses_openai_chat_completions_for_openai_endpoint(monkeypatch): + calls: list[dict] = [] + + async def fail_ollama_version(endpoint: str): # pragma: no cover - should not be called + raise AssertionError("OpenAI-compatible preflight must not check Ollama version") + + class Response: + status_code = 200 + text = "" + + class Client: + def __init__(self, timeout): + self.timeout = timeout + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + async def post(self, url, json=None, headers=None): + calls.append({"url": url, "json": json, "headers": headers}) + return Response() + + monkeypatch.setattr(models, "_fetch_ollama_version", fail_ollama_version) + monkeypatch.setattr(models.httpx, "AsyncClient", Client) + + result = await models.preflight_model( + endpoint="http://127.0.0.1:8088", + model="stepfun-ai/Step-3.7-Flash", + ) + + assert result["ok"] is True + assert calls[0]["url"] == "http://127.0.0.1:8088/v1/chat/completions" + assert calls[0]["json"]["max_tokens"] == 16 + assert "options" not in calls[0]["json"] + + +@pytest.mark.asyncio +async def test_chat_route_forwards_openai_endpoint_auth_headers(monkeypatch): + monkeypatch.setenv("BENCHLOOP_OPENAI_KEYS", "http://127.0.0.1:8088=sk-local") + calls: list[dict] = [] + + class Response: + def raise_for_status(self): + return None + + def json(self): + return { + "choices": [{"message": {"content": "ok"}}], + "usage": {"prompt_tokens": 1, "completion_tokens": 1}, + } + + class Client: + def __init__(self, timeout): + self.timeout = timeout + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc, tb): + return False + + async def post(self, url, json=None, headers=None): + calls.append({"url": url, "json": json, "headers": headers}) + return Response() + + monkeypatch.setattr(chat_routes.httpx, "AsyncClient", Client) + + result = await chat_routes.chat_generate( + chat_routes.ChatRequest( + model="local-model", + endpoint="http://127.0.0.1:8088", + provider="openai_compat", + prompt="Say ok", + ) + ) + + assert result["message"]["content"] == "ok" + assert calls[0]["url"] == "http://127.0.0.1:8088/v1/chat/completions" + assert calls[0]["headers"]["Authorization"] == "Bearer sk-local" diff --git a/tests/test_runtime_robustness.py b/tests/test_runtime_robustness.py new file mode 100644 index 0000000..839a1b0 --- /dev/null +++ b/tests/test_runtime_robustness.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +import pytest + +from bench_loop import hardware +from bench_loop.dashboard.api.routes import benchmark + + +def test_nvidia_smi_na_values_do_not_crash_gpu_detection(monkeypatch): + monkeypatch.delenv("BENCHLOOP_GPU", raising=False) + monkeypatch.delenv("BENCHLOOP_HARDWARE_LABEL", raising=False) + monkeypatch.setattr(hardware.shutil, "which", lambda name: "/usr/bin/nvidia-smi") + monkeypatch.setattr( + hardware, + "_run_command", + lambda command: "NVIDIA GB10 Grace Blackwell, [N/A], [N/A]", + ) + + result = hardware._detect_gpu() + + assert result["gpu"] == "NVIDIA GB10 Grace Blackwell" + assert result["gpu_memory_gb"] == 0.0 + assert result["gpu_temperature_c"] is None + assert result["gpu_details"] == [ + {"name": "NVIDIA GB10 Grace Blackwell", "memory_gb": 0.0, "temperature_c": None} + ] + + +@pytest.mark.asyncio +async def test_active_run_response_omits_internal_task(monkeypatch): + internal_task = object() + monkeypatch.setitem( + benchmark._active_runs, + "active123", + { + "run_id": "active123", + "status": "running", + "task": internal_task, + "events": [], + }, + ) + + result = await benchmark.get_run("active123") + + assert result["status"] == "running" + assert "task" not in result + assert benchmark._active_runs["active123"]["task"] is internal_task