diff --git a/ops-controller/Dockerfile b/ops-controller/Dockerfile index 39e1271..f788762 100644 --- a/ops-controller/Dockerfile +++ b/ops-controller/Dockerfile @@ -13,7 +13,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certifi COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt -COPY main.py audit.py model_registry.py gpu_assignments_fmt.py ./ +COPY main.py audit.py model_registry.py gpu_assignments_fmt.py llamacpp_flags.py ./ # Run as non-root user (docker group for socket access) RUN groupadd -g 999 docker && useradd -m -u 1000 -G docker appuser diff --git a/ops-controller/llamacpp_flags.py b/ops-controller/llamacpp_flags.py new file mode 100644 index 0000000..ac4c0dd --- /dev/null +++ b/ops-controller/llamacpp_flags.py @@ -0,0 +1,261 @@ +"""Single source of truth for llama.cpp launch flags the dashboard/ops-controller +may set, and how each is validated. + +Pure logic (no FastAPI/docker) so it is unit-testable and importable by both the +API layer (validation + the env-key allowlist) and the render step. The dashboard +fetches these descriptors to build its flag UI. MTP is exposed as two virtual +flags (MTP_ENABLED / MTP_N_MAX) that render into LLAMACPP_EXTRA_ARGS. +""" +from __future__ import annotations + +import re + +# Mainline llama.cpp KV cache types (the fork-only tbq*/tbqp* are intentionally +# excluded — they do not exist on the pinned ggml-org build). +_KV_TYPES = {"q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "iq4_nl", "f16"} +# EXTRA_ARGS is word-split into argv (NOT shell-eval'd), but keep a strict +# whitelist anyway as defense-in-depth against injection via the run script. 
_EXTRA_ARGS_RE = re.compile(r"^[a-zA-Z0-9 _.=:/-]*$")
_OVERRIDE_KV_RE = re.compile(r"^[\w.]+=[a-z0-9]+:.+$")
# Plain ASCII decimal integer. int() alone also accepts spellings such as
# "262_144" or non-ASCII digits; since the raw text (not the parsed number) is
# what gets persisted, only the unambiguous spelling is allowed.
_PLAIN_INT_RE = re.compile(r"[+-]?[0-9]+")


def _int(lo, hi):
    """Build a validator for integers in the inclusive range [lo, hi].

    The returned callable takes a value (str or int), and returns None when it
    is valid or a short error string otherwise. Surrounding whitespace is
    ignored; only plain ASCII decimal digits with an optional sign are accepted.
    """
    def v(val):
        s = str(val).strip()
        if not _PLAIN_INT_RE.fullmatch(s):
            return "must be an integer"
        try:
            n = int(s)
        except ValueError:
            # int() can still refuse pathologically long digit strings.
            return "must be an integer"
        if n < lo or n > hi:
            return f"must be between {lo} and {hi}"
        return None
    return v


def _float_min(lo):
    """Build a validator for finite numbers that are >= lo.

    The returned callable returns None when the value is valid, else an error
    string. NaN and +/-infinity are rejected explicitly: `nan < lo` is always
    False, so without the finiteness check NaN would slip through.
    """
    import math  # function-scope import keeps this block self-contained

    def v(val):
        s = str(val).strip()
        try:
            f = float(s)
        except ValueError:
            return "must be a number"
        # float() tolerates digit-group underscores and non-ASCII digits; the
        # raw text is what gets stored, so refuse those spellings.
        if "_" in s or not s.isascii():
            return "must be a number"
        if not math.isfinite(f):
            return "must be a finite number"
        if f < lo:
            return f"must be >= {lo}"
        return None
    return v


def _enum(choices):
    """Build a validator accepting only values whose str() is in `choices`."""
    def v(val):
        return None if str(val) in choices else f"must be one of {sorted(choices)}"
    return v


def _bool(val):
    """Accept exactly "0" or "1" (after str()); return an error string otherwise."""
    return None if str(val) in {"0", "1"} else "must be 0 or 1"


def _override_kv(val):
    """Accept an empty value or a single `key=type:value` metadata override."""
    s = str(val).strip()
    if s == "":
        return None
    # fullmatch: the whole (stripped) value must fit the pattern.
    return (None if _OVERRIDE_KV_RE.fullmatch(s)
            else "must be key=type:value (e.g. arch.context_length=int:524288) or empty")


def _extra_args(val):
    """Accept only strings made of the allowlisted characters.

    Uses fullmatch rather than match: with match(), the pattern's `$` also
    matches just before a trailing newline, so a value such as "--flag\\n"
    would pass the allowlist even though newline is not an allowed character.
    """
    return (None if _EXTRA_ARGS_RE.fullmatch(str(val))
            else "contains disallowed characters (allowed: letters, digits, space, and _ . = : / -)")


def _gguf(val):
    """Accept a bare `.gguf` file name (no directory components, non-empty stem).

    The error text has always promised a *filename*; path separators are
    rejected so the value cannot point outside the directory it is later joined
    onto. Use `_gguf_or_empty` for fields where a path is legitimate.
    """
    s = str(val).strip()
    if not s.endswith(".gguf") or s == ".gguf" or "/" in s or "\\" in s:
        return "must be a .gguf filename"
    return None


def _gguf_or_empty(val):
    """Accept an empty value or any string ending in `.gguf` (paths allowed)."""
    s = str(val).strip()
    return None if s == "" or s.endswith(".gguf") else "must be a .gguf path or empty"


# key -> {group, kind, validate}. `kind` drives the UI input widget.
+FLAGS = { + "LLAMACPP_MODEL": {"group": "core", "kind": "model", "validate": _gguf}, + "LLAMACPP_CTX_SIZE": {"group": "core", "kind": "int", "validate": _int(4096, 1048576)}, + "LLAMACPP_GPU_LAYERS": {"group": "core", "kind": "int", "validate": _int(-1, 1000)}, + "LLAMACPP_ROPE_SCALING": {"group": "context", "kind": "enum", "validate": _enum({"none", "linear", "yarn"})}, + "LLAMACPP_ROPE_SCALE": {"group": "context", "kind": "float", "validate": _float_min(1.0)}, + "LLAMACPP_YARN_ORIG_CTX": {"group": "context", "kind": "int", "validate": _int(0, 1048576)}, + "LLAMACPP_OVERRIDE_KV": {"group": "context", "kind": "string", "validate": _override_kv}, + "LLAMACPP_FLASH_ATTN": {"group": "attention", "kind": "enum", "validate": _enum({"auto", "on", "off"})}, + "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": {"group": "attention", "kind": "bool", "validate": _bool}, + "LLAMACPP_KV_CACHE_TYPE_K": {"group": "attention", "kind": "enum", "validate": _enum(_KV_TYPES)}, + "LLAMACPP_KV_CACHE_TYPE_V": {"group": "attention", "kind": "enum", "validate": _enum(_KV_TYPES)}, + "LLAMACPP_N_PREDICT": {"group": "gen", "kind": "int", "validate": _int(0, 1048576)}, + "LLAMACPP_REASONING_BUDGET": {"group": "gen", "kind": "int", "validate": _int(0, 1048576)}, + "LLAMACPP_MMPROJ": {"group": "multimodal", "kind": "path", "validate": _gguf_or_empty}, + "LLAMACPP_PARALLEL": {"group": "advanced", "kind": "int", "validate": _int(1, 64)}, + "LLAMACPP_USE_MMAP": {"group": "advanced", "kind": "bool", "validate": _bool}, + "LLAMACPP_EXTRA_ARGS": {"group": "advanced", "kind": "string", "validate": _extra_args}, +} + +# Virtual UI flags — rendered into LLAMACPP_EXTRA_ARGS, never written as raw env keys. +VIRTUAL = { + "MTP_ENABLED": {"group": "mtp", "kind": "bool", "validate": _bool}, + "MTP_N_MAX": {"group": "mtp", "kind": "int", "validate": _int(1, 6)}, +} + +# The raw .env keys this module manages (excludes virtual flags). +ENV_KEYS = set(FLAGS) + +# Baseline defaults (model-agnostic). 
effective(flag) = override if set else default. +# Per-model specifics (a 512K model's ctx/rope, vision mmproj, MTP) live as the +# model's overrides in the registry — NOT here. +DEFAULTS = { + "LLAMACPP_MODEL": "", # required — no sensible default; endpoint rejects empty + "LLAMACPP_CTX_SIZE": "262144", + "LLAMACPP_GPU_LAYERS": "-1", + "LLAMACPP_ROPE_SCALING": "none", + "LLAMACPP_ROPE_SCALE": "1", + "LLAMACPP_YARN_ORIG_CTX": "0", + "LLAMACPP_OVERRIDE_KV": "", + "LLAMACPP_FLASH_ATTN": "auto", + "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": "1", + "LLAMACPP_KV_CACHE_TYPE_K": "q8_0", + "LLAMACPP_KV_CACHE_TYPE_V": "q8_0", + "LLAMACPP_N_PREDICT": "65536", + "LLAMACPP_REASONING_BUDGET": "32768", + "LLAMACPP_MMPROJ": "", + "LLAMACPP_PARALLEL": "1", + "LLAMACPP_USE_MMAP": "0", + "LLAMACPP_EXTRA_ARGS": "--reasoning-format deepseek", +} + + +def defaults(): + """A fresh copy of the baseline defaults (one entry per managed env key).""" + return dict(DEFAULTS) + + +# JSON-safe enum choices for the dashboard form (mirrors the validators above). +CHOICES = { + "LLAMACPP_ROPE_SCALING": ["none", "linear", "yarn"], + "LLAMACPP_FLASH_ATTN": ["auto", "on", "off"], + "LLAMACPP_KV_CACHE_TYPE_K": sorted(_KV_TYPES), + "LLAMACPP_KV_CACHE_TYPE_V": sorted(_KV_TYPES), +} + +# One-line explanations surfaced as tooltips in the dashboard flag UI. +HELP = { + "LLAMACPP_MODEL": "The GGUF weights file llama.cpp loads as the chat model.", + "LLAMACPP_CTX_SIZE": "Context window in tokens. Stack-wide cap (Open WebUI, Cline, etc.); larger = more KV-cache VRAM.", + "LLAMACPP_GPU_LAYERS": "How many model layers to offload to the GPU. -1 = all on GPU.", + "LLAMACPP_ROPE_SCALING": "Method to stretch context beyond the model's native length. 'none' = native; 'yarn'/'linear' extend it.", + "LLAMACPP_ROPE_SCALE": "Context-extension factor used with rope scaling (e.g. 2 = double the native length).", + "LLAMACPP_YARN_ORIG_CTX": "The model's native (pre-extension) context length, for YaRN math. 
0 = unset.", + "LLAMACPP_OVERRIDE_KV": "Override a GGUF metadata key as key=type:value (e.g. raise the declared context_length). Empty = none.", + "LLAMACPP_FLASH_ATTN": "Flash Attention. 'auto' lets llama.cpp decide; 'on' forces it (required by quantized KV cache).", + "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": "Quantize the KV cache to fit longer context in VRAM (1 = on).", + "LLAMACPP_KV_CACHE_TYPE_K": "KV-cache quantization for keys. q8_0 = best quality of the quantized set; smaller types save more VRAM.", + "LLAMACPP_KV_CACHE_TYPE_V": "KV-cache quantization for values. q8_0 = best quality; smaller types save more VRAM.", + "LLAMACPP_N_PREDICT": "Hard ceiling on tokens generated per request — a backstop against runaway generation.", + "LLAMACPP_REASONING_BUDGET": "Max tokens the model may spend inside per response.", + "LLAMACPP_MMPROJ": "Vision projector (mmproj GGUF) that enables image input. Empty = text-only.", + "LLAMACPP_PARALLEL": "Number of concurrent request slots the server handles.", + "LLAMACPP_USE_MMAP": "Memory-map the model file. 0 = off (avoids stale page-cache on Docker bind mounts).", + "LLAMACPP_EXTRA_ARGS": "Raw llama-server flags appended verbatim — escape hatch for anything without a dedicated field.", + "MTP_ENABLED": "Multi-Token Prediction speculative decoding (~1.7× faster), using the model's built-in draft head.", + "MTP_N_MAX": "Max speculative draft tokens per step (1–6). 
Hardware-dependent; try a few values.", +} + + +def descriptors(): + """JSON-safe flag metadata for the dashboard to build its form (no callables).""" + out = [] + for key, d in {**FLAGS, **VIRTUAL}.items(): + out.append({ + "key": key, + "group": d["group"], + "kind": d["kind"], + "choices": CHOICES.get(key), + "default": DEFAULTS.get(key), + "help": HELP.get(key), + }) + return out + + +def validate(key, value): + """Return an error string if (key, value) is invalid, else None.""" + desc = FLAGS.get(key) or VIRTUAL.get(key) + if desc is None: + return f"{key} is not a managed llama.cpp flag" + return desc["validate"](value) + + +def validate_all(values): + """Return {key: error} for every invalid entry (empty dict = all valid).""" + return {k: e for k, v in values.items() if (e := validate(k, v)) is not None} + + +def mtp_to_extra_args(enabled, n_max): + """Render the MTP virtual flags into the EXTRA_ARGS fragment.""" + if not enabled: + return "" + return f"--spec-type draft-mtp --spec-draft-n-max {int(n_max)}" + + +def parse_mtp_from_extra_args(extra): + """Inverse of mtp_to_extra_args: (enabled, n_max | None).""" + s = str(extra or "") + if "draft-mtp" not in s: + return (False, None) + m = re.search(r"--spec-draft-n-max\s+(\d+)", s) + return (True, int(m.group(1)) if m else None) + + +def _strip_mtp_args(extra): + """Remove any MTP --spec-* tokens from an args string.""" + s = re.sub(r"--spec-type\s+draft-mtp\b", "", str(extra or "")) + s = re.sub(r"--spec-draft-n-max\s+\d+", "", s) + return re.sub(r"\s+", " ", s).strip() + + +def compute_effective(baseline, overrides): + """Merge baseline `.env` values with a model's overrides into the effective raw + env dict. + + `overrides` may contain virtual MTP flags (folded into EXTRA_ARGS) and `None` + values (clear -> inherit baseline). Returns only raw env keys. 
+ """ + eff = dict(baseline) + for k, v in overrides.items(): + if k in VIRTUAL or v is None: + continue + eff[k] = str(v) + + # MTP: a structured override wins; otherwise inherit whatever EXTRA_ARGS had. + base_enabled, base_n = parse_mtp_from_extra_args(eff.get("LLAMACPP_EXTRA_ARGS", "")) + o_enabled = overrides.get("MTP_ENABLED") + o_n = overrides.get("MTP_N_MAX") + enabled = (str(o_enabled) == "1") if o_enabled is not None else base_enabled + n_max = int(o_n) if o_n not in (None, "") else (base_n or 2) + + stripped = _strip_mtp_args(eff.get("LLAMACPP_EXTRA_ARGS", "")) + frag = mtp_to_extra_args(enabled, n_max) + eff["LLAMACPP_EXTRA_ARGS"] = f"{stripped} {frag}".strip() if frag else stripped + return eff + + +def render_env_file(effective, header="# generated by ops-controller — do not hand-edit"): + """Render the effective config to override-env-file text (managed keys only).""" + lines = [header] + for key in sorted(ENV_KEYS): + if key in effective: + lines.append(f"{key}={effective[key]}") + return "\n".join(lines) + "\n" + + +def flag_view(effective): + """UI-facing view: raw env values plus the derived virtual MTP flags.""" + view = {k: v for k, v in effective.items() if k in ENV_KEYS} + enabled, n_max = parse_mtp_from_extra_args(effective.get("LLAMACPP_EXTRA_ARGS", "")) + view["MTP_ENABLED"] = "1" if enabled else "0" + view["MTP_N_MAX"] = str(n_max if n_max else 2) + return view diff --git a/ops-controller/main.py b/ops-controller/main.py index 205740f..a6f287f 100644 --- a/ops-controller/main.py +++ b/ops-controller/main.py @@ -49,6 +49,16 @@ model_registry = _ilu.module_from_spec(_mr_spec) _mr_spec.loader.exec_module(model_registry) +try: + import llamacpp_flags as lf +except ModuleNotFoundError: # pragma: no cover + import importlib.util as _ilu + _lf_spec = _ilu.spec_from_file_location( + "llamacpp_flags", str(Path(__file__).resolve().parent / "llamacpp_flags.py"), + ) + lf = _ilu.module_from_spec(_lf_spec) + _lf_spec.loader.exec_module(lf) + app = 
FastAPI(title="Ops Controller", version="1.0.0") logger = logging.getLogger(__name__) @@ -80,6 +90,10 @@ BASE_PATH = os.environ.get("BASE_PATH", ".") COMPOSE_FILE_ENV = os.environ.get("COMPOSE_FILE", "docker-compose.yml") +# On-disk GGUF directory (chat models + mmproj) shown in the model-config UI. +MODELS_DIR = Path(os.environ.get("LLAMACPP_MODELS_DIR", "/workspace/models/gguf")) +# Services that template LLAMACPP_CTX_SIZE and must also recreate when ctx changes. +MODEL_CONFIG_CTX_CONSUMERS = ["model-gateway"] # Services whose GPU pin the dashboard may change. GPU_ASSIGNABLE_SERVICES = {"llamacpp", "llamacpp-embed", "comfyui", "stt", "tts"} @@ -227,6 +241,65 @@ def _set_env_keys(kv: dict, request=None) -> None: _write_text_atomic(env_path, content) +# --------------------------------------------------------------------------- +# Model-config control plane (dashboard) — registry overrides -> .env -> recreate +# --------------------------------------------------------------------------- + +def _active_chat_record(): + """The enabled single-model llamacpp (chat) registry record, or None.""" + for rec in REGISTRY.list_models().values(): + if rec.service == "llamacpp" and rec.runtime == "single-model" and rec.enabled: + return rec + return None + + +def _read_env_values(keys): + """Current values for `keys` from the active (uncommented) .env lines.""" + env_path = REGISTRY.env_path + out = {} + if not env_path.exists(): + return out + content = env_path.read_text(encoding="utf-8") + for key in keys: + m = re.search(rf"^{re.escape(key)}=(.*)$", content, re.MULTILINE) + if m: + v = m.group(1).strip() + if len(v) >= 2 and v[0] == v[-1] and v[0] in "\"'": + v = v[1:-1] + out[key] = v + return out + + +def _render_model_config_to_env(effective): + """Upsert every managed flag into .env in place. 
The ^KEY= anchor updates only + the active line, so commented presets in the MODEL CONFIGS block survive.""" + env_path = REGISTRY.env_path + content = env_path.read_text(encoding="utf-8") if env_path.exists() else "" + for key in sorted(lf.ENV_KEYS): + if key not in effective: + continue + val = str(effective[key]) + if "\n" in val or "\r" in val: + raise HTTPException(status_code=400, detail=f"Illegal newline in {key}") + pattern = rf"^{re.escape(key)}=.*" + if re.search(pattern, content, re.MULTILINE): + content = re.sub(pattern, f"{key}={val}", content, flags=re.MULTILINE) + else: + content = content.rstrip("\n") + f"\n{key}={val}\n" + _write_text_atomic(env_path, content) + + +def _list_ggufs(mmproj=False): + """On-disk GGUF basenames. mmproj=True -> only mmproj-* files; else chat models.""" + try: + names = sorted(p.name for p in MODELS_DIR.glob("*.gguf")) + except OSError: + return [] + if mmproj: + return [n for n in names if n.startswith("mmproj")] + return [n for n in names if not n.startswith("mmproj") and "embed" not in n.lower()] + + def _live_gpus() -> dict: """Return {uuid: {"name", "total_gb", "used_gb", "util"}} via nvidia-smi. @@ -1206,6 +1279,101 @@ async def env_get(key: str, _: None = Depends(verify_token)): return {"key": key, "value": raw} +class ModelConfigBody(BaseModel): + overrides: dict = Field(default_factory=dict) + confirm: bool = False + dry_run: bool = False + + +@app.get("/model-config") +async def model_config_get(_: None = Depends(verify_token)): + """Full model-control state for the dashboard: flag descriptors, defaults, + active model, the active model's overrides, effective values, current .env, + and on-disk model/mmproj lists.""" + # Current state = the DEPLOYED .env (filled with defaults), so the UI always + # reflects what's actually running — not a possibly-stale registry record. 
+ base = lf.defaults() + running = _read_env_values(lf.ENV_KEYS) + effective = dict(base) + effective.update(running) + rec = _active_chat_record() + if not effective.get("LLAMACPP_MODEL") and rec and rec.source.get("file"): + effective["LLAMACPP_MODEL"] = rec.source["file"] + # An "override" = an effective value that differs from the baseline default. + overrides = {k: effective[k] for k in lf.ENV_KEYS + if k in effective and effective[k] != base.get(k, "")} + return { + "flags": lf.descriptors(), + "defaults": base, + "active_model": effective.get("LLAMACPP_MODEL", ""), + "overrides": overrides, + "effective": lf.flag_view(effective), + "running": running, + "models": _list_ggufs(), + "mmprojs": _list_ggufs(mmproj=True), + } + + +@app.post("/model-config") +async def model_config_post(body: ModelConfigBody, request: Request, + _: None = Depends(verify_token)): + """Validate + apply model-config overrides via the ONE write path: persist to + the registry, render into .env, recreate llamacpp (+ ctx consumers).""" + errs = lf.validate_all({k: v for k, v in body.overrides.items() if v is not None}) + if errs: + raise HTTPException(status_code=400, detail={"validation": errs}) + if body.dry_run: + return {"would": "apply", "overrides": body.overrides} + if not body.confirm: + raise HTTPException(status_code=400, detail="Set {\"confirm\": true} to apply.") + + rec = _active_chat_record() + if rec is None: + raise HTTPException(status_code=404, detail="No active single-model llamacpp record") + + config = dict(rec.config) + source_file = rec.source.get("file", "") + ctx_touched = False + for k, v in body.overrides.items(): + if k == "LLAMACPP_MODEL": + if v: + source_file = str(v) + config.pop("LLAMACPP_MODEL", None) + continue + if k == "LLAMACPP_CTX_SIZE": + ctx_touched = True + if v is None: + config.pop(k, None) + else: + config[k] = str(v) + + overrides = dict(config) + if source_file: + overrides["LLAMACPP_MODEL"] = source_file + effective = 
lf.compute_effective(lf.defaults(), overrides) + + model = effective.get("LLAMACPP_MODEL", "") + if not model: + raise HTTPException(status_code=400, detail="A model file must be set") + if not (MODELS_DIR / model).exists(): + raise HTTPException(status_code=400, detail=f"Model file not found: {model}") + + _render_model_config_to_env(effective) + rec.config = config + rec.source = {**rec.source, "file": source_file} + rec.updated_by = "model-config" + REGISTRY.upsert(rec) + + services = ["llamacpp"] + (MODEL_CONFIG_CTX_CONSUMERS if ctx_touched else []) + for svc in services: + _recreate_service(svc, request) + + _audit("model_config", model, "ok", f"keys={sorted(body.overrides)}", + correlation_id=_correlation_id(request)) + return {"ok": True, "active_model": model, + "effective": lf.flag_view(effective), "recreated": services} + + @app.post("/services/{service_id}/recreate") async def service_recreate( service_id: str, body: ConfirmBody, request: Request, diff --git a/ops-controller/test_llamacpp_flags.py b/ops-controller/test_llamacpp_flags.py new file mode 100644 index 0000000..f37115f --- /dev/null +++ b/ops-controller/test_llamacpp_flags.py @@ -0,0 +1,215 @@ +"""Flag-schema validation tests (pure; no FastAPI/docker). + +llamacpp_flags is the single source of truth for which llama.cpp launch knobs the +dashboard/ops-controller may set and how each is validated. Drives API validation, +the env-key allowlist, and MTP<->extra_args rendering. 
+""" +from __future__ import annotations + +import importlib.util +from pathlib import Path + +_PATH = Path(__file__).resolve().parent / "llamacpp_flags.py" +_spec = importlib.util.spec_from_file_location("llamacpp_flags_under_test", _PATH) +lf = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(lf) + + +# --- enum flags --- +def test_enum_accepts_valid(): + assert lf.validate("LLAMACPP_ROPE_SCALING", "yarn") is None + assert lf.validate("LLAMACPP_FLASH_ATTN", "auto") is None + assert lf.validate("LLAMACPP_KV_CACHE_TYPE_K", "q8_0") is None + + +def test_enum_rejects_invalid(): + assert lf.validate("LLAMACPP_ROPE_SCALING", "bogus") is not None + assert lf.validate("LLAMACPP_FLASH_ATTN", "maybe") is not None + # fork-only TurboQuant KV type is NOT valid on the pinned mainline build + assert lf.validate("LLAMACPP_KV_CACHE_TYPE_K", "tbq3_0") is not None + + +# --- int flags with range --- +def test_int_in_range_ok(): + assert lf.validate("LLAMACPP_CTX_SIZE", "262144") is None + assert lf.validate("LLAMACPP_CTX_SIZE", 262144) is None + + +def test_int_out_of_range_or_nonint(): + assert lf.validate("LLAMACPP_CTX_SIZE", "1000") is not None # below min + assert lf.validate("LLAMACPP_CTX_SIZE", "99999999") is not None # above max + assert lf.validate("LLAMACPP_CTX_SIZE", "notanint") is not None + + +def test_gpu_layers_allows_negative_one(): + assert lf.validate("LLAMACPP_GPU_LAYERS", "-1") is None + + +# --- float flag --- +def test_rope_scale_float(): + assert lf.validate("LLAMACPP_ROPE_SCALE", "2") is None + assert lf.validate("LLAMACPP_ROPE_SCALE", "1.5") is None + assert lf.validate("LLAMACPP_ROPE_SCALE", "0.5") is not None # must be >= 1 + assert lf.validate("LLAMACPP_ROPE_SCALE", "abc") is not None + + +# --- bool flag --- +def test_bool_flag(): + assert lf.validate("LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION", "1") is None + assert lf.validate("LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION", "0") is None + assert 
lf.validate("LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION", "2") is not None + + +# --- override_kv format --- +def test_override_kv_format(): + assert lf.validate("LLAMACPP_OVERRIDE_KV", "") is None # empty = unset + assert lf.validate("LLAMACPP_OVERRIDE_KV", "qwen35moe.context_length=int:524288") is None + assert lf.validate("LLAMACPP_OVERRIDE_KV", "garbage") is not None + + +# --- extra_args whitelist (anti shell-injection) --- +def test_extra_args_whitelist(): + assert lf.validate("LLAMACPP_EXTRA_ARGS", "--spec-type draft-mtp --spec-draft-n-max 2") is None + assert lf.validate("LLAMACPP_EXTRA_ARGS", "--reasoning-format deepseek") is None + assert lf.validate("LLAMACPP_EXTRA_ARGS", "$(evil)") is not None # shell metachars + assert lf.validate("LLAMACPP_EXTRA_ARGS", "a; rm -rf x") is not None # semicolon + + +# --- MTP first-class flags --- +def test_mtp_n_max_range(): + assert lf.validate("MTP_N_MAX", "2") is None + assert lf.validate("MTP_N_MAX", "0") is not None + assert lf.validate("MTP_N_MAX", "9") is not None + + +def test_mtp_enabled_bool(): + assert lf.validate("MTP_ENABLED", "1") is None + assert lf.validate("MTP_ENABLED", "nope") is not None + + +# --- unknown key --- +def test_unknown_key_rejected(): + assert lf.validate("LLAMACPP_NOT_A_FLAG", "x") is not None + + +# --- ENV_KEYS exposes the managed set --- +def test_env_keys_set(): + assert "LLAMACPP_MODEL" in lf.ENV_KEYS + assert "LLAMACPP_CTX_SIZE" in lf.ENV_KEYS + assert "LLAMACPP_OVERRIDE_KV" in lf.ENV_KEYS + # MTP_* are virtual (rendered into EXTRA_ARGS), not raw env keys + assert "MTP_ENABLED" not in lf.ENV_KEYS + + +# --- validate_all --- +def test_validate_all_reports_only_invalid(): + errs = lf.validate_all({ + "LLAMACPP_CTX_SIZE": "262144", # ok + "LLAMACPP_ROPE_SCALING": "bogus", # bad + "LLAMACPP_FLASH_ATTN": "off", # ok + }) + assert set(errs) == {"LLAMACPP_ROPE_SCALING"} + + +# --- MTP <-> extra_args round-trip --- +def test_mtp_renders_into_extra_args(): + frag = lf.mtp_to_extra_args(True, 3) 
+ assert "--spec-type" in frag and "draft-mtp" in frag and "3" in frag + + +def test_mtp_disabled_renders_empty(): + assert lf.mtp_to_extra_args(False, 2).strip() == "" + + +def test_mtp_parsed_from_extra_args(): + enabled, n_max = lf.parse_mtp_from_extra_args("--spec-type draft-mtp --spec-draft-n-max 4 --reasoning-format deepseek") + assert enabled is True + assert n_max == 4 + + +def test_mtp_parse_absent(): + enabled, n_max = lf.parse_mtp_from_extra_args("--reasoning-format deepseek") + assert enabled is False + + +# --- compute_effective (baseline + overrides) + render --- +BASELINE = { + "LLAMACPP_MODEL": "base.gguf", + "LLAMACPP_CTX_SIZE": "262144", + "LLAMACPP_ROPE_SCALING": "none", + "LLAMACPP_EXTRA_ARGS": "--reasoning-format deepseek", + "LLAMACPP_KV_CACHE_TYPE_K": "q8_0", +} + + +def test_effective_override_wins(): + eff = lf.compute_effective(BASELINE, {"LLAMACPP_CTX_SIZE": "524288"}) + assert eff["LLAMACPP_CTX_SIZE"] == "524288" + assert eff["LLAMACPP_ROPE_SCALING"] == "none" # inherited from baseline + + +def test_effective_none_clears_to_baseline(): + eff = lf.compute_effective(BASELINE, {"LLAMACPP_CTX_SIZE": None}) + assert eff["LLAMACPP_CTX_SIZE"] == "262144" + + +def test_effective_mtp_enabled_folds_into_extra_args(): + eff = lf.compute_effective(BASELINE, {"MTP_ENABLED": "1", "MTP_N_MAX": "3"}) + ex = eff["LLAMACPP_EXTRA_ARGS"] + assert "--reasoning-format deepseek" in ex + assert "--spec-type draft-mtp" in ex and "--spec-draft-n-max 3" in ex + + +def test_effective_mtp_disabled_strips_spec_args(): + base = dict(BASELINE, + LLAMACPP_EXTRA_ARGS="--spec-type draft-mtp --spec-draft-n-max 2 --reasoning-format deepseek") + eff = lf.compute_effective(base, {"MTP_ENABLED": "0"}) + assert "draft-mtp" not in eff["LLAMACPP_EXTRA_ARGS"] + assert "--reasoning-format deepseek" in eff["LLAMACPP_EXTRA_ARGS"] + + +def test_render_env_file_only_managed_keys(): + eff = lf.compute_effective(BASELINE, {}) + text = lf.render_env_file(eff) + assert 
"LLAMACPP_CTX_SIZE=262144" in text + assert "MTP_ENABLED" not in text # virtual flag is not a raw env key + parsed = dict(line.split("=", 1) for line in text.strip().splitlines() + if "=" in line and not line.startswith("#")) + assert parsed["LLAMACPP_MODEL"] == "base.gguf" + + +def test_overrides_for_model_extracts_mtp_virtuals(): + # given an effective EXTRA_ARGS with MTP, the UI-facing view exposes the virtuals + view = lf.flag_view({"LLAMACPP_EXTRA_ARGS": "--spec-type draft-mtp --spec-draft-n-max 4"}) + assert view["MTP_ENABLED"] == "1" + assert view["MTP_N_MAX"] == "4" + + +def test_defaults_cover_every_managed_key(): + d = lf.defaults() + assert set(d) == lf.ENV_KEYS # one default per managed flag, no extras + # every default value is itself valid + assert lf.validate_all({k: v for k, v in d.items() if k != "LLAMACPP_MODEL"}) == {} + + +def test_reset_to_default_via_effective(): + # an override sets ctx high; clearing it (None) falls back to the default baseline + base = lf.defaults() + eff = lf.compute_effective(base, {"LLAMACPP_CTX_SIZE": "524288"}) + assert eff["LLAMACPP_CTX_SIZE"] == "524288" + eff2 = lf.compute_effective(base, {"LLAMACPP_CTX_SIZE": None}) + assert eff2["LLAMACPP_CTX_SIZE"] == "262144" # default + + +def test_descriptors_are_json_safe_and_cover_flags(): + import json + desc = lf.descriptors() + json.dumps(desc) # must be serializable (no callables) + keys = {d["key"] for d in desc} + assert lf.ENV_KEYS <= keys + assert {"MTP_ENABLED", "MTP_N_MAX"} <= keys + rope = next(d for d in desc if d["key"] == "LLAMACPP_ROPE_SCALING") + assert rope["choices"] == ["none", "linear", "yarn"] + assert rope["kind"] == "enum" + # every flag carries a non-empty help string for the UI tooltip + assert all(d.get("help") for d in desc), [d["key"] for d in desc if not d.get("help")] diff --git a/ops-controller/test_model_config_endpoint.py b/ops-controller/test_model_config_endpoint.py new file mode 100644 index 0000000..5c2eb88 --- /dev/null +++ 
b/ops-controller/test_model_config_endpoint.py @@ -0,0 +1,120 @@ +"""Endpoint tests for GET/POST /model-config (the dashboard control-plane API). + +Uses temp .env + registry + models dir, reloads main against them, and stubs +_recreate_service so no docker runs. +""" +import importlib +import json + +import pytest +from fastapi.testclient import TestClient + +TOKEN = "test-token-for-test" +AUTH = {"Authorization": f"Bearer {TOKEN}"} + + +@pytest.fixture +def app_env(tmp_path, monkeypatch): + env = tmp_path / ".env" + env.write_text( + "# === MODEL CONFIGS ===\n" + "LLAMACPP_MODEL=base.gguf\n" + "LLAMACPP_CTX_SIZE=262144\n" + "LLAMACPP_ROPE_SCALING=none\n" + "LLAMACPP_EXTRA_ARGS=--reasoning-format deepseek\n" + "# LLAMACPP_MODEL=preset-a3b.gguf\n", # commented preset MUST survive edits + encoding="utf-8", + ) + reg = tmp_path / "registry.json" + reg.write_text(json.dumps({"version": 1, "models": { + "local-chat": {"id": "local-chat", "kind": "chat", "service": "llamacpp", + "runtime": "single-model", "enabled": True, + "source": {"file": "base.gguf"}, "config": {}, + "gpu_uuid": None, "est_vram_gb": 0.0, + "updated_by": "test", "updated_at": None}}}), encoding="utf-8") + models = tmp_path / "gguf" + models.mkdir() + (models / "base.gguf").write_bytes(b"x") + (models / "dense27b.gguf").write_bytes(b"x") + + monkeypatch.setenv("OPS_CONTROLLER_TOKEN", TOKEN) + monkeypatch.setenv("OPS_ENV_PATH", str(env)) + monkeypatch.setenv("MODEL_REGISTRY_PATH", str(reg)) + monkeypatch.setenv("LLAMACPP_MODELS_DIR", str(models)) + + import ops_controller.main as m + importlib.reload(m) + calls = [] + monkeypatch.setattr(m, "_recreate_service", + lambda svc, request=None: (calls.append(svc), {"ok": True, "service": svc})[1]) + return m, env, reg, calls + + +def test_get_model_config(app_env): + m, env, reg, calls = app_env + r = TestClient(m.app).get("/model-config", headers=AUTH) + assert r.status_code == 200, r.text + b = r.json() + assert b["active_model"] == "base.gguf" + assert 
any(d["key"] == "LLAMACPP_CTX_SIZE" for d in b["flags"]) + assert "base.gguf" in b["models"] and "dense27b.gguf" in b["models"] + assert b["effective"]["LLAMACPP_CTX_SIZE"] == "262144" + + +def test_get_requires_auth(app_env): + m, env, reg, calls = app_env + assert TestClient(m.app).get("/model-config").status_code in (401, 403) + + +def test_post_sets_override_and_recreates(app_env): + m, env, reg, calls = app_env + r = TestClient(m.app).post("/model-config", headers=AUTH, + json={"confirm": True, "overrides": {"LLAMACPP_CTX_SIZE": "524288"}}) + assert r.status_code == 200, r.text + txt = env.read_text() + assert "LLAMACPP_CTX_SIZE=524288" in txt + assert "# LLAMACPP_MODEL=preset-a3b.gguf" in txt # commented preset preserved + assert "llamacpp" in calls + cfg = json.loads(reg.read_text())["models"]["local-chat"]["config"] + assert cfg.get("LLAMACPP_CTX_SIZE") == "524288" + + +def test_post_validation_400_no_recreate(app_env): + m, env, reg, calls = app_env + r = TestClient(m.app).post("/model-config", headers=AUTH, + json={"confirm": True, "overrides": {"LLAMACPP_ROPE_SCALING": "bogus"}}) + assert r.status_code == 400 + assert "LLAMACPP_ROPE_SCALING" in r.text + assert calls == [] + + +def test_post_requires_confirm(app_env): + m, env, reg, calls = app_env + r = TestClient(m.app).post("/model-config", headers=AUTH, + json={"overrides": {"LLAMACPP_CTX_SIZE": "524288"}}) + assert r.status_code == 400 + + +def test_post_clear_reverts_to_default(app_env): + m, env, reg, calls = app_env + c = TestClient(m.app) + c.post("/model-config", headers=AUTH, json={"confirm": True, "overrides": {"LLAMACPP_CTX_SIZE": "524288"}}) + c.post("/model-config", headers=AUTH, json={"confirm": True, "overrides": {"LLAMACPP_CTX_SIZE": None}}) + assert "LLAMACPP_CTX_SIZE=262144" in env.read_text() # back to default baseline + + +def test_post_model_swap_updates_source(app_env): + m, env, reg, calls = app_env + r = TestClient(m.app).post("/model-config", headers=AUTH, + json={"confirm": 
True, "overrides": {"LLAMACPP_MODEL": "dense27b.gguf"}}) + assert r.status_code == 200, r.text + assert "LLAMACPP_MODEL=dense27b.gguf" in env.read_text() + assert json.loads(reg.read_text())["models"]["local-chat"]["source"]["file"] == "dense27b.gguf" + + +def test_post_rejects_missing_model_file(app_env): + m, env, reg, calls = app_env + r = TestClient(m.app).post("/model-config", headers=AUTH, + json={"confirm": True, "overrides": {"LLAMACPP_MODEL": "nope.gguf"}}) + assert r.status_code == 400 + assert calls == []