Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,17 @@ benchloop run --model your-model --provider openai_compat --endpoint http://your

The CLI flag takes precedence over the env var. For Ollama and local providers without auth, neither is needed.

When you use multiple OpenAI-compatible endpoints, set per-endpoint keys with
`BENCHLOOP_OPENAI_KEYS`. Entries are comma-separated `endpoint=key` pairs, and
each endpoint must match the base URL you pass to BenchLoop (trailing slashes are ignored):

```bash
export BENCHLOOP_OPENAI_KEYS="http://127.0.0.1:8000=sk-local,https://openrouter.ai/api=sk-or-..."
```

BenchLoop uses the matching endpoint-specific key first, then falls back to
`OPENAI_API_KEY`.

### Launch the local dashboard

v0.2.0+ ships the full FastAPI + React dashboard inside the wheel. After `pipx install benchloop-cli`:
Expand Down
4 changes: 3 additions & 1 deletion bench_loop/dashboard/api/routes/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,9 @@ async def get_run(run_id: str):
"""Get detailed run result."""
# Check active runs first
if run_id in _active_runs:
return _active_runs[run_id]
state = dict(_active_runs[run_id])
state.pop("task", None)
return state

# Check disk
run_dir = RUNS_DIR / run_id
Expand Down
3 changes: 3 additions & 0 deletions bench_loop/dashboard/api/routes/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

from bench_loop.providers.openai_compat import _auth_headers as _openai_headers

router = APIRouter()


Expand Down Expand Up @@ -61,6 +63,7 @@ async def chat_generate(req: ChatRequest):
resp = await client.post(
f"{req.endpoint.rstrip('/')}/v1/chat/completions",
json=payload,
headers=_openai_headers(req.endpoint),
)
resp.raise_for_status()
data = resp.json()
Expand Down
118 changes: 82 additions & 36 deletions bench_loop/dashboard/api/routes/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from pydantic import BaseModel
import httpx

from bench_loop.providers.openai_compat import _auth_headers as _openai_headers

try:
from sse_starlette.sse import EventSourceResponse
except Exception: # pragma: no cover
Expand Down Expand Up @@ -195,7 +197,7 @@ async def _fetch_ollama_models(endpoint: str) -> list[dict]:
async def _fetch_openai_models(endpoint: str) -> list[dict]:
"""Fetch models from an OpenAI-compatible endpoint."""
async with httpx.AsyncClient(timeout=5) as client:
resp = await client.get(f"{endpoint.rstrip('/')}/v1/models")
resp = await client.get(f"{endpoint.rstrip('/')}/v1/models", headers=_openai_headers(endpoint))
resp.raise_for_status()
raw = resp.json().get("data", [])
return [
Expand Down Expand Up @@ -232,20 +234,27 @@ async def _probe_provider(provider: dict) -> dict | None:
return None


# Ports and hostnames treated as hints that an endpoint speaks the
# OpenAI-compatible API; consulted by `_is_openai_compat_endpoint`.
_OPENAI_HINT_PORTS = {1234, 1337, 5001, 8000, 8080, 8081}
_OPENAI_HINT_PORTS = {1234, 1337, 5001, 8000, 8080, 8081, 8088, 10531, 11451}
_OPENAI_HINT_HOSTS = {"api.openai.com", "openrouter.ai"}


def _is_openai_compat_endpoint(endpoint: str) -> bool:
    """Guess whether *endpoint* points at an OpenAI-compatible server.

    The endpoint is parsed as a URL (``http://`` is assumed when no scheme is
    given) and counts as OpenAI-compatible when its port is listed in
    ``_OPENAI_HINT_PORTS`` or its lower-cased hostname is listed in
    ``_OPENAI_HINT_HOSTS``.

    Returns ``False`` instead of raising when the endpoint cannot be parsed,
    including when it carries a non-numeric or out-of-range port.
    """
    from urllib.parse import urlparse

    try:
        parsed = urlparse(endpoint if "://" in endpoint else f"http://{endpoint}")
        # ``.port`` is validated lazily: it raises ValueError on attribute
        # access for a non-numeric or out-of-range port, so it must be read
        # inside the ``try`` rather than after it.
        port = parsed.port
        hostname = (parsed.hostname or "").lower()
    except Exception:
        # Best-effort classification: an unparseable endpoint is simply not
        # treated as OpenAI-compatible.
        return False
    return port in _OPENAI_HINT_PORTS or hostname in _OPENAI_HINT_HOSTS


@router.get("/models")
async def list_models(endpoint: str = Query(default="")):
"""List models. If endpoint specified, query that. Otherwise auto-detect local providers."""
if endpoint:
# If the port is well-known for OpenAI-compatible servers, try that first.
from urllib.parse import urlparse
try:
port = urlparse(endpoint).port
except Exception:
port = None
order: list[str] = ["openai", "ollama"] if port in _OPENAI_HINT_PORTS else ["ollama", "openai"]
order: list[str] = ["openai", "ollama"] if _is_openai_compat_endpoint(endpoint) else ["ollama", "openai"]
last_error: str | None = None
for kind in order:
try:
Expand Down Expand Up @@ -307,39 +316,68 @@ async def preflight_model(
"""
ep = endpoint.rstrip("/")

# Version check against our support table (fast path, no model load needed)
version = await _fetch_ollama_version(ep)
installed_tuple = _version_tuple(version) if version else (0,)
# We don't have the details dict for an arbitrary model name — do a best
# effort classification from the name alone.
support = _classify_model_support(model, {"family": "", "quantization_level": ""})
required = support["required_version"]
if required and installed_tuple < _version_tuple(required):
is_openai_compat = _is_openai_compat_endpoint(ep)

# Ollama version checks only apply to Ollama endpoints. OpenAI-compatible
# servers such as llama.cpp/vLLM do not expose /api/version and should not
# be blocked by Ollama architecture/version rules.
version = None
if not is_openai_compat:
version = await _fetch_ollama_version(ep)
installed_tuple = _version_tuple(version) if version else (0,)
# We don't have the details dict for an arbitrary model name — do a best
# effort classification from the name alone.
support = _classify_model_support(model, {"family": "", "quantization_level": ""})
required = support["required_version"]
if required and installed_tuple < _version_tuple(required):
return {
"ok": False,
"reason": "version_mismatch",
"message": (
f"Model `{model}` needs Ollama {required}+ ({support['reason']}). "
f"Installed: {version or 'unknown'}. Upgrade with "
f"`brew upgrade ollama` then restart `ollama serve`."
),
"required_version": required,
"provider_version": version,
"raw": None,
}

# Actual load test: minimum-cost chat round-trip
try:
async with httpx.AsyncClient(timeout=15) as client:
if is_openai_compat:
resp = await client.post(
f"{ep}/v1/chat/completions",
json={
"model": model,
"messages": [{"role": "user", "content": "ok"}],
"stream": False,
"max_tokens": 16,
},
headers=_openai_headers(ep),
)
else:
resp = await client.post(
f"{ep}/api/chat",
json={
"model": model,
"messages": [{"role": "user", "content": "ok"}],
"stream": False,
"options": {"num_predict": 1},
},
)
except httpx.ReadTimeout as exc:
return {
"ok": False,
"reason": "version_mismatch",
"ok": True,
"reason": "load_timeout",
"message": (
f"Model `{model}` needs Ollama {required}+ ({support['reason']}). "
f"Installed: {version or 'unknown'}. Upgrade with "
f"`brew upgrade ollama` then restart `ollama serve`."
f"Model `{model}` did not answer within the quick preflight window. "
"Continuing with the benchmark so large models can finish loading normally."
),
"required_version": required,
"raw": str(exc),
"provider_version": version,
"raw": None,
}

# Actual load test: minimum-cost chat round-trip
try:
async with httpx.AsyncClient(timeout=15) as client:
resp = await client.post(
f"{ep}/api/chat",
json={
"model": model,
"messages": [{"role": "user", "content": "ok"}],
"stream": False,
"options": {"num_predict": 1},
},
)
except httpx.HTTPError as exc:
return {
"ok": False,
Expand Down Expand Up @@ -386,6 +424,14 @@ async def preflight_model(
}

if "model" in lower and "not found" in lower:
if is_openai_compat:
return {
"ok": False,
"reason": "not_found",
"message": f"`{model}` is not available at {ep}. Check the model id from `/v1/models`.",
"raw": raw,
"provider_version": version,
}
return {
"ok": False,
"reason": "not_found",
Expand Down
15 changes: 13 additions & 2 deletions bench_loop/hardware.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,16 @@ def _env_hardware_overrides() -> dict[str, object]:
return overrides


def _parse_nvidia_float(value: str) -> float | None:
cleaned = value.strip()
if not cleaned or cleaned.upper() == "[N/A]":
return None
try:
return float(cleaned)
except ValueError:
return None


def _detect_gpu() -> dict[str, object]:
overrides = _env_hardware_overrides()
if overrides.get("gpu") or overrides.get("hardware_label"):
Expand Down Expand Up @@ -133,8 +143,9 @@ def _detect_gpu() -> dict[str, object]:
if len(parts) < 3:
continue
name, memory_mb, temp_c = parts[0], parts[1], parts[2]
memory_gb = float(memory_mb) / 1024 if memory_mb else 0.0
temperature = float(temp_c) if temp_c else None
memory_value = _parse_nvidia_float(memory_mb)
temperature = _parse_nvidia_float(temp_c)
memory_gb = memory_value / 1024 if memory_value is not None else 0.0
details.append({"name": name, "memory_gb": memory_gb, "temperature_c": temperature})
names.append(name)
total_memory_gb += memory_gb
Expand Down
21 changes: 16 additions & 5 deletions bench_loop/providers/openai_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,20 @@
_STREAM_TIMEOUT = httpx.Timeout(connect=15.0, read=600.0, write=60.0, pool=60.0)


def _auth_headers() -> dict[str, str]:
def _api_key_for_endpoint(endpoint: str) -> str:
ep = endpoint.rstrip("/")
raw = os.getenv("BENCHLOOP_OPENAI_KEYS", "").strip()
if raw:
for chunk in raw.split(","):
url, sep, key = chunk.partition("=")
if sep and url.strip().rstrip("/") == ep:
return key.strip()
return os.getenv("OPENAI_API_KEY", "").strip()


def _auth_headers(endpoint: str) -> dict[str, str]:
    """Build the HTTP headers for a request to *endpoint*.

    A JSON ``Content-Type`` is always present. An ``Authorization: Bearer``
    header is added only when `_api_key_for_endpoint` returns a non-empty key.
    """
    token = _api_key_for_endpoint(endpoint)
    if not token:
        return {"Content-Type": "application/json"}
    return {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {token}",
    }
Expand All @@ -27,7 +38,7 @@ async def list_models(endpoint: str) -> list[str]:
base_url = endpoint.rstrip("/")
try:
async with httpx.AsyncClient(timeout=_HTTP_TIMEOUT) as client:
response = await client.get(f"{base_url}/v1/models", headers=_auth_headers())
response = await client.get(f"{base_url}/v1/models", headers=_auth_headers(endpoint))
response.raise_for_status()
except Exception:
return []
Expand Down Expand Up @@ -63,7 +74,7 @@ async def chat(endpoint: str, model: str, messages: list[dict[str, Any]], **kwar
response = await client.post(
f"{base_url}/v1/chat/completions",
json=payload,
headers=_auth_headers(),
headers=_auth_headers(endpoint),
)
response.raise_for_status()
except Exception as exc:
Expand Down Expand Up @@ -168,7 +179,7 @@ async def chat_streaming(
"POST",
f"{base_url}/v1/chat/completions",
json=payload,
headers=_auth_headers(),
headers=_auth_headers(endpoint),
) as response:
response.raise_for_status()
async for line in response.aiter_lines():
Expand Down
Loading