From a5317c4dbea61991ff245bc506c73ec06a7ed02e Mon Sep 17 00:00:00 2001
From: Hermes Bot <hermes@ordo-ai-stack.local>
Date: Tue, 23 Jun 2026 11:11:54 -0400
Subject: [PATCH 1/3] feat(ops-controller): model-config control-plane API
 (/model-config)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Backend for dashboard-driven llama.cpp model control: every launch flag becomes
a first-class, validated, editable knob through one API, with the registry as the
per-model override store and .env as the rendered baseline — ONE write path, so
the registry/.env drift that bit us before is structurally impossible.

- llamacpp_flags.py: declarative flag schema (types/ranges/enums), per-flag
  validation, baseline defaults, baseline+override merge with reset-to-default,
  MTP<->extra_args folding, and JSON-safe descriptors for the UI.
- GET /model-config: flags + defaults + active model + overrides + effective +
  on-disk model/mmproj lists (drives the whole UI in one call).
- POST /model-config: validate -> persist to the active registry record ->
  render into .env (per-line upsert; commented presets survive) -> recreate
  llamacpp (+ model-gateway when ctx changes).

Tests: 27 (schema/render) + 8 (endpoints); full ops-controller suite = 69 passing.
The dashboard flag-card UI that consumes this lands in a follow-up.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 ops-controller/llamacpp_flags.py             | 237 +++++++++++++++++++
 ops-controller/main.py                       | 161 +++++++++++++
 ops-controller/test_llamacpp_flags.py        | 213 +++++++++++++++++
 ops-controller/test_model_config_endpoint.py | 120 ++++++++++
 4 files changed, 731 insertions(+)
 create mode 100644 ops-controller/llamacpp_flags.py
 create mode 100644 ops-controller/test_llamacpp_flags.py
 create mode 100644 ops-controller/test_model_config_endpoint.py

diff --git a/ops-controller/llamacpp_flags.py b/ops-controller/llamacpp_flags.py
new file mode 100644
index 0000000..dccdb80
--- /dev/null
+++ b/ops-controller/llamacpp_flags.py
@@ -0,0 +1,237 @@
+"""Single source of truth for llama.cpp launch flags the dashboard/ops-controller
+may set, and how each is validated.
+
+Pure logic (no FastAPI/docker) so it is unit-testable and importable by both the
+API layer (validation + the env-key allowlist) and the render step. The dashboard
+fetches these descriptors to build its flag UI. MTP is exposed as two virtual
+flags (MTP_ENABLED / MTP_N_MAX) that render into LLAMACPP_EXTRA_ARGS.
+"""
+from __future__ import annotations
+
+import re
+
+# Mainline llama.cpp KV cache types (the fork-only tbq*/tbqp* are intentionally
+# excluded — they do not exist on the pinned ggml-org build).
+_KV_TYPES = {"q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "iq4_nl", "f16"}
+# EXTRA_ARGS is word-split into argv (NOT shell-eval'd), but keep a strict
+# whitelist anyway as defense-in-depth against injection via the run script.
+_EXTRA_ARGS_RE = re.compile(r"^[a-zA-Z0-9 _.=:/-]*$")
+_OVERRIDE_KV_RE = re.compile(r"^[\w.]+=[a-z0-9]+:.+$")
+
+
+def _int(lo, hi):  # validator factory: integer within [lo, hi], else an error string
+    def v(val):  # plain ASCII digits only: int() alone would also accept "1_000"
+        s = str(val).strip()
+        if not re.fullmatch(r"[+-]?[0-9]+", s):
+            return "must be an integer"
+        n = int(s)
+        if n < lo or n > hi:
+            return f"must be between {lo} and {hi}"
+        return None
+    return v
+
+
+def _float_min(lo):  # validator factory: finite float >= lo, else an error string
+    def v(val):  # NaN/inf parse as floats but are never a usable flag value
+        try:
+            f = float(str(val).strip())
+        except (TypeError, ValueError):
+            return "must be a number"
+        if f != f or f == float("inf"):
+            return "must be a finite number"
+        return None if f >= lo else f"must be >= {lo}"
+    return v
+
+
+def _enum(choices):
+    def check(candidate):  # membership is tested on the str() form of the value
+        return f"must be one of {sorted(choices)}" if str(candidate) not in choices else None
+    return check
+
+
+def _bool(val):  # passes only when str(val) is exactly "0" or "1"
+    return "must be 0 or 1" if str(val) not in ("0", "1") else None
+
+
+def _override_kv(val):
+    text = str(val).strip()
+    # Blank means "no override"; anything else must look like key=type:value.
+    if not text or _OVERRIDE_KV_RE.match(text):
+        return None
+    return "must be key=type:value (e.g. arch.context_length=int:524288) or empty"
+
+
+def _extra_args(val):  # fullmatch: "$" alone would let a trailing newline through
+    return (None if _EXTRA_ARGS_RE.fullmatch(str(val))
+            else "contains disallowed characters (allowed: letters, digits, space, and _ . = : / -)")
+
+
+def _gguf(val):  # the value is whitespace-trimmed before the suffix check
+    return "must be a .gguf filename" if not str(val).strip().endswith(".gguf") else None
+
+
+def _gguf_or_empty(val):
+    name = str(val).strip()  # blank is accepted as well as a .gguf name
+    return "must be a .gguf path or empty" if name and not name.endswith(".gguf") else None
+
+
+# key -> {group, kind, validate}. `kind` drives the UI input widget.
+FLAGS = {
+    "LLAMACPP_MODEL":                     {"group": "core",       "kind": "model",  "validate": _gguf},
+    "LLAMACPP_CTX_SIZE":                  {"group": "core",       "kind": "int",    "validate": _int(4096, 1048576)},
+    "LLAMACPP_GPU_LAYERS":                {"group": "core",       "kind": "int",    "validate": _int(-1, 1000)},
+    "LLAMACPP_ROPE_SCALING":              {"group": "context",    "kind": "enum",   "validate": _enum({"none", "linear", "yarn"})},
+    "LLAMACPP_ROPE_SCALE":                {"group": "context",    "kind": "float",  "validate": _float_min(1.0)},
+    "LLAMACPP_YARN_ORIG_CTX":             {"group": "context",    "kind": "int",    "validate": _int(0, 1048576)},
+    "LLAMACPP_OVERRIDE_KV":               {"group": "context",    "kind": "string", "validate": _override_kv},
+    "LLAMACPP_FLASH_ATTN":                {"group": "attention",  "kind": "enum",   "validate": _enum({"auto", "on", "off"})},
+    "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": {"group": "attention", "kind": "bool", "validate": _bool},
+    "LLAMACPP_KV_CACHE_TYPE_K":           {"group": "attention",  "kind": "enum",   "validate": _enum(_KV_TYPES)},
+    "LLAMACPP_KV_CACHE_TYPE_V":           {"group": "attention",  "kind": "enum",   "validate": _enum(_KV_TYPES)},
+    "LLAMACPP_N_PREDICT":                 {"group": "gen",        "kind": "int",    "validate": _int(0, 1048576)},
+    "LLAMACPP_REASONING_BUDGET":          {"group": "gen",        "kind": "int",    "validate": _int(0, 1048576)},
+    "LLAMACPP_MMPROJ":                    {"group": "multimodal", "kind": "path",   "validate": _gguf_or_empty},
+    "LLAMACPP_PARALLEL":                  {"group": "advanced",   "kind": "int",    "validate": _int(1, 64)},
+    "LLAMACPP_USE_MMAP":                  {"group": "advanced",   "kind": "bool",   "validate": _bool},
+    "LLAMACPP_EXTRA_ARGS":                {"group": "advanced",   "kind": "string", "validate": _extra_args},
+}
+
+# Virtual UI flags — rendered into LLAMACPP_EXTRA_ARGS, never written as raw env keys.
+VIRTUAL = {
+    "MTP_ENABLED": {"group": "mtp", "kind": "bool", "validate": _bool},
+    "MTP_N_MAX":   {"group": "mtp", "kind": "int",  "validate": _int(1, 6)},
+}
+
+# The raw .env keys this module manages (excludes virtual flags).
+ENV_KEYS = set(FLAGS)
+
+# Baseline defaults (model-agnostic). effective(flag) = override if set else default.
+# Per-model specifics (a 512K model's ctx/rope, vision mmproj, MTP) live as the
+# model's overrides in the registry — NOT here.
+DEFAULTS = {
+    "LLAMACPP_MODEL": "",  # required — no sensible default; endpoint rejects empty
+    "LLAMACPP_CTX_SIZE": "262144",
+    "LLAMACPP_GPU_LAYERS": "-1",
+    "LLAMACPP_ROPE_SCALING": "none",
+    "LLAMACPP_ROPE_SCALE": "1",
+    "LLAMACPP_YARN_ORIG_CTX": "0",
+    "LLAMACPP_OVERRIDE_KV": "",
+    "LLAMACPP_FLASH_ATTN": "auto",
+    "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": "1",
+    "LLAMACPP_KV_CACHE_TYPE_K": "q8_0",
+    "LLAMACPP_KV_CACHE_TYPE_V": "q8_0",
+    "LLAMACPP_N_PREDICT": "65536",
+    "LLAMACPP_REASONING_BUDGET": "32768",
+    "LLAMACPP_MMPROJ": "",
+    "LLAMACPP_PARALLEL": "1",
+    "LLAMACPP_USE_MMAP": "0",
+    "LLAMACPP_EXTRA_ARGS": "--reasoning-format deepseek",
+}
+
+
+def defaults():
+    """Return a new dict copy of the baseline defaults (one entry per managed env key)."""
+    return {**DEFAULTS}
+
+
+# JSON-safe enum choices for the dashboard form (mirrors the validators above).
+CHOICES = {
+    "LLAMACPP_ROPE_SCALING": ["none", "linear", "yarn"],
+    "LLAMACPP_FLASH_ATTN": ["auto", "on", "off"],
+    "LLAMACPP_KV_CACHE_TYPE_K": sorted(_KV_TYPES),
+    "LLAMACPP_KV_CACHE_TYPE_V": sorted(_KV_TYPES),
+}
+
+
+def descriptors():
+    """Flag metadata as plain dicts (no validator callables) for the dashboard form."""
+    return [
+        {
+            "key": name,
+            "group": spec["group"],
+            "kind": spec["kind"],
+            "choices": CHOICES.get(name),
+            "default": DEFAULTS.get(name),
+        }
+        for name, spec in {**FLAGS, **VIRTUAL}.items()
+    ]
+
+
+def validate(key, value):
+    """Check one (key, value) pair; returns an error message, or None when valid."""
+    spec = FLAGS.get(key) or VIRTUAL.get(key)
+    if spec is not None:
+        return spec["validate"](value)
+    return f"{key} is not a managed llama.cpp flag"
+
+
+def validate_all(values):
+    """Collect {key: error} over `values`; an empty dict means every entry passed."""
+    return {k: err for k, err in ((name, validate(name, val)) for name, val in values.items()) if err is not None}
+
+
+def mtp_to_extra_args(enabled, n_max):
+    """Build the EXTRA_ARGS fragment for the virtual MTP flags ("" when disabled)."""
+    if enabled:
+        return f"--spec-type draft-mtp --spec-draft-n-max {int(n_max)}"
+    return ""
+
+
+def parse_mtp_from_extra_args(extra):
+    """Recover (enabled, n_max) from an args string; n_max is None if not present."""
+    text = str(extra or "")
+    if "draft-mtp" in text:
+        found = re.search(r"--spec-draft-n-max\s+(\d+)", text)
+        return (True, None if found is None else int(found.group(1)))
+    return (False, None)
+
+
+def _strip_mtp_args(extra):
+    """Drop the MTP --spec-* tokens from an args string and collapse whitespace."""
+    without_type = re.sub(r"--spec-type\s+draft-mtp\b", "", str(extra or ""))
+    without_mtp = re.sub(r"--spec-draft-n-max\s+\d+", "", without_type)
+    return re.sub(r"\s+", " ", without_mtp).strip()
+
+
+def compute_effective(baseline, overrides):
+    """Layer a model's overrides on top of the baseline and return raw env values.
+
+    Non-virtual overrides replace the baseline value (stringified); a `None`
+    override is skipped so the baseline shows through. The virtual MTP flags are
+    never emitted as keys; they are folded into LLAMACPP_EXTRA_ARGS instead.
+    """
+    merged = dict(baseline)
+    # None means "inherit the baseline"; the virtual flags are handled further down.
+    merged.update({name: str(value) for name, value in overrides.items()
+                   if value is not None and name not in VIRTUAL})
+
+    # Structured MTP overrides take precedence over whatever EXTRA_ARGS carries.
+    current_args = merged.get("LLAMACPP_EXTRA_ARGS", "")
+    args_enabled, args_n = parse_mtp_from_extra_args(current_args)
+    want_enabled = overrides.get("MTP_ENABLED")
+    want_n = overrides.get("MTP_N_MAX")
+    mtp_on = args_enabled if want_enabled is None else str(want_enabled) == "1"
+    depth = (args_n or 2) if want_n in (None, "") else int(want_n)
+
+    rest = _strip_mtp_args(current_args)
+    mtp_fragment = mtp_to_extra_args(mtp_on, depth)
+    merged["LLAMACPP_EXTRA_ARGS"] = f"{rest} {mtp_fragment}".strip() if mtp_fragment else rest
+    return merged
+
+
+def render_env_file(effective, header="# generated by ops-controller — do not hand-edit"):
+    """Serialize `effective` as KEY=value lines (managed keys only, sorted) under `header`."""
+    body = [f"{name}={effective[name]}" for name in sorted(ENV_KEYS) if name in effective]
+    # Every line, including the last, is newline-terminated.
+    text = "\n".join([header, *body])
+    text += "\n"
+    return text
+
+
+def flag_view(effective):
+    """Dashboard view of `effective`: managed env keys plus MTP_ENABLED / MTP_N_MAX."""
+    mtp_on, depth = parse_mtp_from_extra_args(effective.get("LLAMACPP_EXTRA_ARGS", ""))
+    shown = {name: value for name, value in effective.items() if name in ENV_KEYS}
+    # With no parsable draft depth the view reports "2".
+    shown.update(MTP_ENABLED="1" if mtp_on else "0", MTP_N_MAX=str(depth if depth else 2))
+    return shown
diff --git a/ops-controller/main.py b/ops-controller/main.py
index 205740f..447df87 100644
--- a/ops-controller/main.py
+++ b/ops-controller/main.py
@@ -49,6 +49,16 @@
     model_registry = _ilu.module_from_spec(_mr_spec)
     _mr_spec.loader.exec_module(model_registry)
 
+try:
+    import llamacpp_flags as lf
+except ModuleNotFoundError:  # pragma: no cover
+    import importlib.util as _ilu
+    _lf_spec = _ilu.spec_from_file_location(
+        "llamacpp_flags", str(Path(__file__).resolve().parent / "llamacpp_flags.py"),
+    )
+    lf = _ilu.module_from_spec(_lf_spec)
+    _lf_spec.loader.exec_module(lf)
+
 app = FastAPI(title="Ops Controller", version="1.0.0")
 logger = logging.getLogger(__name__)
 
@@ -80,6 +90,10 @@
 
 BASE_PATH = os.environ.get("BASE_PATH", ".")
 COMPOSE_FILE_ENV = os.environ.get("COMPOSE_FILE", "docker-compose.yml")
+# On-disk GGUF directory (chat models + mmproj) shown in the model-config UI.
+MODELS_DIR = Path(os.environ.get("LLAMACPP_MODELS_DIR", "/workspace/models/gguf"))
+# Services that template LLAMACPP_CTX_SIZE and must also recreate when ctx changes.
+MODEL_CONFIG_CTX_CONSUMERS = ["model-gateway"]
 
 # Services whose GPU pin the dashboard may change.
 GPU_ASSIGNABLE_SERVICES = {"llamacpp", "llamacpp-embed", "comfyui", "stt", "tts"}
@@ -227,6 +241,65 @@ def _set_env_keys(kv: dict, request=None) -> None:
     _write_text_atomic(env_path, content)
 
 
+# ---------------------------------------------------------------------------
+# Model-config control plane (dashboard) — registry overrides -> .env -> recreate
+# ---------------------------------------------------------------------------
+
+def _active_chat_record():
+    """First registry record that is an enabled single-model llamacpp service, else None."""
+    return next(
+        (rec for rec in REGISTRY.list_models().values()
+         if rec.service == "llamacpp" and rec.runtime == "single-model" and rec.enabled),
+        None)
+
+
+def _read_env_values(keys):
+    """Look up `keys` among the uncommented KEY=value lines of the registry's .env."""
+    env_path = REGISTRY.env_path
+    if not env_path.exists():
+        return {}
+    text = env_path.read_text(encoding="utf-8")
+    found = {}
+    for key in keys:
+        hit = re.search(rf"^{re.escape(key)}=(.*)$", text, re.MULTILINE)
+        if hit is None:
+            continue
+        value = hit.group(1).strip()
+        quoted = len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'"
+        found[key] = value[1:-1] if quoted else value  # drop one matching quote pair
+    return found
+
+
+def _render_model_config_to_env(effective):
+    """Upsert every managed flag into .env in place. The ^KEY= anchor updates only
+    the active line, so commented presets survive; values are written verbatim."""
+    env_path = REGISTRY.env_path
+    content = env_path.read_text(encoding="utf-8") if env_path.exists() else ""
+    for key in sorted(lf.ENV_KEYS):
+        if key not in effective:
+            continue
+        val = str(effective[key])
+        if "\n" in val or "\r" in val:
+            raise HTTPException(status_code=400, detail=f"Illegal newline in {key}")
+        pattern = rf"^{re.escape(key)}=.*"
+        if re.search(pattern, content, re.MULTILINE):  # callable repl: no "\n"/"\1" expansion
+            content = re.sub(pattern, lambda _m: f"{key}={val}", content, flags=re.MULTILINE)
+        else:
+            content = content.rstrip("\n") + f"\n{key}={val}\n"
+    _write_text_atomic(env_path, content)
+
+
+def _list_ggufs(mmproj=False):
+    """Sorted *.gguf basenames in MODELS_DIR: mmproj-prefixed ones, or the chat models."""
+    try:
+        names = sorted(entry.name for entry in MODELS_DIR.glob("*.gguf"))
+    except OSError:
+        return []
+    if not mmproj:  # chat list: drop mmproj-prefixed files and names containing "embed"
+        return [n for n in names if not (n.startswith("mmproj") or "embed" in n.lower())]
+    return [n for n in names if n.startswith("mmproj")]
+
+
 def _live_gpus() -> dict:
     """Return {uuid: {"name", "total_gb", "used_gb", "util"}} via nvidia-smi.
 
@@ -1206,6 +1279,94 @@ async def env_get(key: str, _: None = Depends(verify_token)):
     return {"key": key, "value": raw}
 
 
+class ModelConfigBody(BaseModel):  # request body for POST /model-config
+    overrides: dict = Field(default_factory=dict)  # flag key -> value; None clears that override
+    confirm: bool = False  # must be true for the POST to actually apply
+    dry_run: bool = False  # validate only; the handler returns before touching anything
+
+
+@app.get("/model-config")
+async def model_config_get(_: None = Depends(verify_token)):
+    """Full model-control state for the dashboard: flag descriptors, defaults,
+    active model, the active model's overrides, effective values, current .env,
+    and on-disk model/mmproj lists."""
+    rec = _active_chat_record()
+    overrides = dict(rec.config) if rec else {}
+    if rec and rec.source.get("file") and "LLAMACPP_MODEL" not in overrides:
+        overrides["LLAMACPP_MODEL"] = rec.source["file"]
+    effective = lf.compute_effective(lf.defaults(), overrides)
+    return {
+        "flags": lf.descriptors(),
+        "defaults": lf.defaults(),
+        "active_model": effective.get("LLAMACPP_MODEL", ""),
+        "overrides": overrides,
+        "effective": lf.flag_view(effective),
+        "running": _read_env_values(lf.ENV_KEYS),
+        "models": _list_ggufs(),
+        "mmprojs": _list_ggufs(mmproj=True),
+    }
+
+
+@app.post("/model-config")
+async def model_config_post(body: ModelConfigBody, request: Request,
+                            _: None = Depends(verify_token)):
+    """Validate + apply model-config overrides via the ONE write path: render into
+    .env, persist to the registry, then recreate llamacpp (+ ctx consumers)."""
+    errs = lf.validate_all({k: v for k, v in body.overrides.items() if v is not None})
+    if errs:  # None-valued entries (clears) were left out of validation above
+        raise HTTPException(status_code=400, detail={"validation": errs})
+    if body.dry_run:  # validated only: nothing below has run yet
+        return {"would": "apply", "overrides": body.overrides}
+    if not body.confirm:
+        raise HTTPException(status_code=400, detail="Set {\"confirm\": true} to apply.")
+
+    rec = _active_chat_record()
+    if rec is None:
+        raise HTTPException(status_code=404, detail="No active single-model llamacpp record")
+
+    config = dict(rec.config)
+    source_file = rec.source.get("file", "")
+    ctx_touched = False
+    for k, v in body.overrides.items():
+        if k == "LLAMACPP_MODEL":  # the model is kept in rec.source["file"], not in config
+            if v:
+                source_file = str(v)
+            config.pop("LLAMACPP_MODEL", None)
+            continue
+        if k == "LLAMACPP_CTX_SIZE":
+            ctx_touched = True
+        if v is None:  # clear this override
+            config.pop(k, None)
+        else:
+            config[k] = str(v)
+
+    overrides = dict(config)
+    if source_file:
+        overrides["LLAMACPP_MODEL"] = source_file
+    effective = lf.compute_effective(lf.defaults(), overrides)
+
+    model = effective.get("LLAMACPP_MODEL", "")
+    if not model:
+        raise HTTPException(status_code=400, detail="A model file must be set")
+    if not (MODELS_DIR / model).exists():
+        raise HTTPException(status_code=400, detail=f"Model file not found: {model}")
+
+    _render_model_config_to_env(effective)  # .env is written before the registry record
+    rec.config = config
+    rec.source = {**rec.source, "file": source_file}
+    rec.updated_by = "model-config"
+    REGISTRY.upsert(rec)
+
+    services = ["llamacpp"] + (MODEL_CONFIG_CTX_CONSUMERS if ctx_touched else [])
+    for svc in services:  # ctx consumers are included only when the request named the ctx key
+        _recreate_service(svc, request)
+
+    _audit("model_config", model, "ok", f"keys={sorted(body.overrides)}",
+           correlation_id=_correlation_id(request))
+    return {"ok": True, "active_model": model,
+            "effective": lf.flag_view(effective), "recreated": services}
+
+
 @app.post("/services/{service_id}/recreate")
 async def service_recreate(
     service_id: str, body: ConfirmBody, request: Request,
diff --git a/ops-controller/test_llamacpp_flags.py b/ops-controller/test_llamacpp_flags.py
new file mode 100644
index 0000000..27fcef3
--- /dev/null
+++ b/ops-controller/test_llamacpp_flags.py
@@ -0,0 +1,213 @@
+"""Flag-schema validation tests (pure; no FastAPI/docker).
+
+llamacpp_flags is the single source of truth for which llama.cpp launch knobs the
+dashboard/ops-controller may set and how each is validated. Drives API validation,
+the env-key allowlist, and MTP<->extra_args rendering.
+"""
+from __future__ import annotations
+
+import importlib.util
+from pathlib import Path
+
+_PATH = Path(__file__).resolve().parent / "llamacpp_flags.py"
+_spec = importlib.util.spec_from_file_location("llamacpp_flags_under_test", _PATH)
+lf = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(lf)
+
+
+# --- enum flags ---
+def test_enum_accepts_valid():
+    assert lf.validate("LLAMACPP_ROPE_SCALING", "yarn") is None
+    assert lf.validate("LLAMACPP_FLASH_ATTN", "auto") is None
+    assert lf.validate("LLAMACPP_KV_CACHE_TYPE_K", "q8_0") is None
+
+
+def test_enum_rejects_invalid():
+    assert lf.validate("LLAMACPP_ROPE_SCALING", "bogus") is not None
+    assert lf.validate("LLAMACPP_FLASH_ATTN", "maybe") is not None
+    # fork-only TurboQuant KV type is NOT valid on the pinned mainline build
+    assert lf.validate("LLAMACPP_KV_CACHE_TYPE_K", "tbq3_0") is not None
+
+
+# --- int flags with range ---
+def test_int_in_range_ok():
+    assert lf.validate("LLAMACPP_CTX_SIZE", "262144") is None
+    assert lf.validate("LLAMACPP_CTX_SIZE", 262144) is None
+
+
+def test_int_out_of_range_or_nonint():
+    assert lf.validate("LLAMACPP_CTX_SIZE", "1000") is not None        # below min
+    assert lf.validate("LLAMACPP_CTX_SIZE", "99999999") is not None    # above max
+    assert lf.validate("LLAMACPP_CTX_SIZE", "notanint") is not None
+
+
+def test_gpu_layers_allows_negative_one():
+    assert lf.validate("LLAMACPP_GPU_LAYERS", "-1") is None
+
+
+# --- float flag ---
+def test_rope_scale_float():
+    assert lf.validate("LLAMACPP_ROPE_SCALE", "2") is None
+    assert lf.validate("LLAMACPP_ROPE_SCALE", "1.5") is None
+    assert lf.validate("LLAMACPP_ROPE_SCALE", "0.5") is not None   # must be >= 1
+    assert lf.validate("LLAMACPP_ROPE_SCALE", "abc") is not None
+
+
+# --- bool flag ---
+def test_bool_flag():
+    assert lf.validate("LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION", "1") is None
+    assert lf.validate("LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION", "0") is None
+    assert lf.validate("LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION", "2") is not None
+
+
+# --- override_kv format ---
+def test_override_kv_format():
+    assert lf.validate("LLAMACPP_OVERRIDE_KV", "") is None  # empty = unset
+    assert lf.validate("LLAMACPP_OVERRIDE_KV", "qwen35moe.context_length=int:524288") is None
+    assert lf.validate("LLAMACPP_OVERRIDE_KV", "garbage") is not None
+
+
+# --- extra_args whitelist (anti shell-injection) ---
+def test_extra_args_whitelist():
+    assert lf.validate("LLAMACPP_EXTRA_ARGS", "--spec-type draft-mtp --spec-draft-n-max 2") is None
+    assert lf.validate("LLAMACPP_EXTRA_ARGS", "--reasoning-format deepseek") is None
+    assert lf.validate("LLAMACPP_EXTRA_ARGS", "$(evil)") is not None      # shell metachars
+    assert lf.validate("LLAMACPP_EXTRA_ARGS", "a; rm -rf x") is not None  # semicolon
+
+
+# --- MTP first-class flags ---
+def test_mtp_n_max_range():
+    assert lf.validate("MTP_N_MAX", "2") is None
+    assert lf.validate("MTP_N_MAX", "0") is not None
+    assert lf.validate("MTP_N_MAX", "9") is not None
+
+
+def test_mtp_enabled_bool():
+    assert lf.validate("MTP_ENABLED", "1") is None
+    assert lf.validate("MTP_ENABLED", "nope") is not None
+
+
+# --- unknown key ---
+def test_unknown_key_rejected():
+    assert lf.validate("LLAMACPP_NOT_A_FLAG", "x") is not None
+
+
+# --- ENV_KEYS exposes the managed set ---
+def test_env_keys_set():
+    assert "LLAMACPP_MODEL" in lf.ENV_KEYS
+    assert "LLAMACPP_CTX_SIZE" in lf.ENV_KEYS
+    assert "LLAMACPP_OVERRIDE_KV" in lf.ENV_KEYS
+    # MTP_* are virtual (rendered into EXTRA_ARGS), not raw env keys
+    assert "MTP_ENABLED" not in lf.ENV_KEYS
+
+
+# --- validate_all ---
+def test_validate_all_reports_only_invalid():
+    errs = lf.validate_all({
+        "LLAMACPP_CTX_SIZE": "262144",        # ok
+        "LLAMACPP_ROPE_SCALING": "bogus",     # bad
+        "LLAMACPP_FLASH_ATTN": "off",         # ok
+    })
+    assert set(errs) == {"LLAMACPP_ROPE_SCALING"}
+
+
+# --- MTP <-> extra_args round-trip ---
+def test_mtp_renders_into_extra_args():
+    frag = lf.mtp_to_extra_args(True, 3)
+    assert "--spec-type" in frag and "draft-mtp" in frag and "3" in frag
+
+
+def test_mtp_disabled_renders_empty():
+    assert lf.mtp_to_extra_args(False, 2).strip() == ""
+
+
+def test_mtp_parsed_from_extra_args():
+    enabled, n_max = lf.parse_mtp_from_extra_args("--spec-type draft-mtp --spec-draft-n-max 4 --reasoning-format deepseek")
+    assert enabled is True
+    assert n_max == 4
+
+
+def test_mtp_parse_absent():
+    enabled, n_max = lf.parse_mtp_from_extra_args("--reasoning-format deepseek")
+    assert enabled is False and n_max is None  # no MTP tokens -> nothing parsed
+
+
+# --- compute_effective (baseline + overrides) + render ---
+BASELINE = {
+    "LLAMACPP_MODEL": "base.gguf",
+    "LLAMACPP_CTX_SIZE": "262144",
+    "LLAMACPP_ROPE_SCALING": "none",
+    "LLAMACPP_EXTRA_ARGS": "--reasoning-format deepseek",
+    "LLAMACPP_KV_CACHE_TYPE_K": "q8_0",
+}
+
+
+def test_effective_override_wins():
+    eff = lf.compute_effective(BASELINE, {"LLAMACPP_CTX_SIZE": "524288"})
+    assert eff["LLAMACPP_CTX_SIZE"] == "524288"
+    assert eff["LLAMACPP_ROPE_SCALING"] == "none"  # inherited from baseline
+
+
+def test_effective_none_clears_to_baseline():
+    eff = lf.compute_effective(BASELINE, {"LLAMACPP_CTX_SIZE": None})
+    assert eff["LLAMACPP_CTX_SIZE"] == "262144"
+
+
+def test_effective_mtp_enabled_folds_into_extra_args():
+    eff = lf.compute_effective(BASELINE, {"MTP_ENABLED": "1", "MTP_N_MAX": "3"})
+    ex = eff["LLAMACPP_EXTRA_ARGS"]
+    assert "--reasoning-format deepseek" in ex
+    assert "--spec-type draft-mtp" in ex and "--spec-draft-n-max 3" in ex
+
+
+def test_effective_mtp_disabled_strips_spec_args():
+    base = dict(BASELINE,
+                LLAMACPP_EXTRA_ARGS="--spec-type draft-mtp --spec-draft-n-max 2 --reasoning-format deepseek")
+    eff = lf.compute_effective(base, {"MTP_ENABLED": "0"})
+    assert "draft-mtp" not in eff["LLAMACPP_EXTRA_ARGS"]
+    assert "--reasoning-format deepseek" in eff["LLAMACPP_EXTRA_ARGS"]
+
+
+def test_render_env_file_only_managed_keys():
+    eff = lf.compute_effective(BASELINE, {})
+    text = lf.render_env_file(eff)
+    assert "LLAMACPP_CTX_SIZE=262144" in text
+    assert "MTP_ENABLED" not in text  # virtual flag is not a raw env key
+    parsed = dict(line.split("=", 1) for line in text.strip().splitlines()
+                  if "=" in line and not line.startswith("#"))
+    assert parsed["LLAMACPP_MODEL"] == "base.gguf"
+
+
+def test_overrides_for_model_extracts_mtp_virtuals():
+    # given an effective EXTRA_ARGS with MTP, the UI-facing view exposes the virtuals
+    view = lf.flag_view({"LLAMACPP_EXTRA_ARGS": "--spec-type draft-mtp --spec-draft-n-max 4"})
+    assert view["MTP_ENABLED"] == "1"
+    assert view["MTP_N_MAX"] == "4"
+
+
+def test_defaults_cover_every_managed_key():
+    d = lf.defaults()
+    assert set(d) == lf.ENV_KEYS  # one default per managed flag, no extras
+    # every default value is itself valid
+    assert lf.validate_all({k: v for k, v in d.items() if k != "LLAMACPP_MODEL"}) == {}
+
+
+def test_reset_to_default_via_effective():
+    # an override sets ctx high; clearing it (None) falls back to the default baseline
+    base = lf.defaults()
+    eff = lf.compute_effective(base, {"LLAMACPP_CTX_SIZE": "524288"})
+    assert eff["LLAMACPP_CTX_SIZE"] == "524288"
+    eff2 = lf.compute_effective(base, {"LLAMACPP_CTX_SIZE": None})
+    assert eff2["LLAMACPP_CTX_SIZE"] == "262144"  # default
+
+
+def test_descriptors_are_json_safe_and_cover_flags():
+    import json
+    desc = lf.descriptors()
+    json.dumps(desc)  # must be serializable (no callables)
+    keys = {d["key"] for d in desc}
+    assert lf.ENV_KEYS <= keys
+    assert {"MTP_ENABLED", "MTP_N_MAX"} <= keys
+    rope = next(d for d in desc if d["key"] == "LLAMACPP_ROPE_SCALING")
+    assert rope["choices"] == ["none", "linear", "yarn"]
+    assert rope["kind"] == "enum"
diff --git a/ops-controller/test_model_config_endpoint.py b/ops-controller/test_model_config_endpoint.py
new file mode 100644
index 0000000..5c2eb88
--- /dev/null
+++ b/ops-controller/test_model_config_endpoint.py
@@ -0,0 +1,120 @@
+"""Endpoint tests for GET/POST /model-config (the dashboard control-plane API).
+
+Uses temp .env + registry + models dir, reloads main against them, and stubs
+_recreate_service so no docker runs.
+"""
+import importlib
+import json
+
+import pytest
+from fastapi.testclient import TestClient
+
+TOKEN = "test-token-for-test"
+AUTH = {"Authorization": f"Bearer {TOKEN}"}
+
+
+@pytest.fixture
+def app_env(tmp_path, monkeypatch):
+    env = tmp_path / ".env"
+    env.write_text(
+        "# === MODEL CONFIGS ===\n"
+        "LLAMACPP_MODEL=base.gguf\n"
+        "LLAMACPP_CTX_SIZE=262144\n"
+        "LLAMACPP_ROPE_SCALING=none\n"
+        "LLAMACPP_EXTRA_ARGS=--reasoning-format deepseek\n"
+        "# LLAMACPP_MODEL=preset-a3b.gguf\n",  # commented preset MUST survive edits
+        encoding="utf-8",
+    )
+    reg = tmp_path / "registry.json"
+    reg.write_text(json.dumps({"version": 1, "models": {
+        "local-chat": {"id": "local-chat", "kind": "chat", "service": "llamacpp",
+                       "runtime": "single-model", "enabled": True,
+                       "source": {"file": "base.gguf"}, "config": {},
+                       "gpu_uuid": None, "est_vram_gb": 0.0,
+                       "updated_by": "test", "updated_at": None}}}), encoding="utf-8")
+    models = tmp_path / "gguf"
+    models.mkdir()
+    (models / "base.gguf").write_bytes(b"x")
+    (models / "dense27b.gguf").write_bytes(b"x")
+
+    monkeypatch.setenv("OPS_CONTROLLER_TOKEN", TOKEN)
+    monkeypatch.setenv("OPS_ENV_PATH", str(env))
+    monkeypatch.setenv("MODEL_REGISTRY_PATH", str(reg))
+    monkeypatch.setenv("LLAMACPP_MODELS_DIR", str(models))
+
+    import ops_controller.main as m
+    importlib.reload(m)
+    calls = []
+    monkeypatch.setattr(m, "_recreate_service",
+                        lambda svc, request=None: (calls.append(svc), {"ok": True, "service": svc})[1])
+    return m, env, reg, calls
+
+
+def test_get_model_config(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).get("/model-config", headers=AUTH)
+    assert r.status_code == 200, r.text
+    b = r.json()
+    assert b["active_model"] == "base.gguf"
+    assert any(d["key"] == "LLAMACPP_CTX_SIZE" for d in b["flags"])
+    assert "base.gguf" in b["models"] and "dense27b.gguf" in b["models"]
+    assert b["effective"]["LLAMACPP_CTX_SIZE"] == "262144"
+
+
+def test_get_requires_auth(app_env):
+    m, env, reg, calls = app_env
+    assert TestClient(m.app).get("/model-config").status_code in (401, 403)
+
+
+def test_post_sets_override_and_recreates(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"confirm": True, "overrides": {"LLAMACPP_CTX_SIZE": "524288"}})
+    assert r.status_code == 200, r.text
+    txt = env.read_text()
+    assert "LLAMACPP_CTX_SIZE=524288" in txt
+    assert "# LLAMACPP_MODEL=preset-a3b.gguf" in txt  # commented preset preserved
+    assert "llamacpp" in calls
+    cfg = json.loads(reg.read_text())["models"]["local-chat"]["config"]
+    assert cfg.get("LLAMACPP_CTX_SIZE") == "524288"
+
+
+def test_post_validation_400_no_recreate(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"confirm": True, "overrides": {"LLAMACPP_ROPE_SCALING": "bogus"}})
+    assert r.status_code == 400
+    assert "LLAMACPP_ROPE_SCALING" in r.text
+    assert calls == []
+
+
+def test_post_requires_confirm(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"overrides": {"LLAMACPP_CTX_SIZE": "524288"}})
+    assert r.status_code == 400
+
+
+def test_post_clear_reverts_to_default(app_env):
+    m, env, reg, calls = app_env
+    c = TestClient(m.app)
+    c.post("/model-config", headers=AUTH, json={"confirm": True, "overrides": {"LLAMACPP_CTX_SIZE": "524288"}})
+    c.post("/model-config", headers=AUTH, json={"confirm": True, "overrides": {"LLAMACPP_CTX_SIZE": None}})
+    assert "LLAMACPP_CTX_SIZE=262144" in env.read_text()  # back to default baseline
+
+
+def test_post_model_swap_updates_source(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"confirm": True, "overrides": {"LLAMACPP_MODEL": "dense27b.gguf"}})
+    assert r.status_code == 200, r.text
+    assert "LLAMACPP_MODEL=dense27b.gguf" in env.read_text()
+    assert json.loads(reg.read_text())["models"]["local-chat"]["source"]["file"] == "dense27b.gguf"
+
+
+def test_post_rejects_missing_model_file(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"confirm": True, "overrides": {"LLAMACPP_MODEL": "nope.gguf"}})
+    assert r.status_code == 400
+    assert calls == []

From 0ae3eaf4f001aa7d29e57acd30c9a0f47bdaf868 Mon Sep 17 00:00:00 2001
From: Hermes Bot <hermes@ordo-ai-stack.local>
Date: Tue, 23 Jun 2026 11:38:32 -0400
Subject: [PATCH 2/3] fix(ops-controller): copy llamacpp_flags.py in image; GET
 reflects deployed .env

Found during a local rebuild + validation:
- Dockerfile COPY missed llamacpp_flags.py -> ops-controller crash-looped at
  startup (FileNotFoundError). Added it to the COPY list.
- GET /model-config computed `effective` from the (possibly stale) registry
  record, so it could disagree with what's actually deployed (e.g. reported
  MTP off while the running .env had it on). It now reads the deployed .env as
  the source of truth for current state; an override = an effective value that
  differs from the baseline default.

Validated live: /model-config reports model=Qwen3.6-27B, ctx=262144, MTP on,
vision on; full dashboard proxy chain works.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 ops-controller/Dockerfile |  2 +-
 ops-controller/main.py    | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/ops-controller/Dockerfile b/ops-controller/Dockerfile
index 39e1271..f788762 100644
--- a/ops-controller/Dockerfile
+++ b/ops-controller/Dockerfile
@@ -13,7 +13,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certifi
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-COPY main.py audit.py model_registry.py gpu_assignments_fmt.py ./
+COPY main.py audit.py model_registry.py gpu_assignments_fmt.py llamacpp_flags.py ./
 
 # Run as non-root user (docker group for socket access)
 RUN groupadd -g 999 docker && useradd -m -u 1000 -G docker appuser
diff --git a/ops-controller/main.py b/ops-controller/main.py
index 447df87..a6f287f 100644
--- a/ops-controller/main.py
+++ b/ops-controller/main.py
@@ -1290,18 +1290,25 @@ async def model_config_get(_: None = Depends(verify_token)):
     """Full model-control state for the dashboard: flag descriptors, defaults,
     active model, the active model's overrides, effective values, current .env,
     and on-disk model/mmproj lists."""
+    # Current state = baseline defaults overlaid with the DEPLOYED .env, so the UI
+    # always reflects what's actually running — not a possibly-stale registry record.
+    base = lf.defaults()
+    running = _read_env_values(lf.ENV_KEYS)
+    effective = dict(base)
+    effective.update(running)
     rec = _active_chat_record()
-    overrides = dict(rec.config) if rec else {}
-    if rec and rec.source.get("file") and "LLAMACPP_MODEL" not in overrides:
-        overrides["LLAMACPP_MODEL"] = rec.source["file"]
-    effective = lf.compute_effective(lf.defaults(), overrides)
+    if not effective.get("LLAMACPP_MODEL") and rec and rec.source.get("file"):
+        effective["LLAMACPP_MODEL"] = rec.source["file"]
+    # An "override" = an effective value that differs from the baseline default.
+    overrides = {k: effective[k] for k in lf.ENV_KEYS
+                 if k in effective and effective[k] != base.get(k, "")}
     return {
         "flags": lf.descriptors(),
-        "defaults": lf.defaults(),
+        "defaults": base,
         "active_model": effective.get("LLAMACPP_MODEL", ""),
         "overrides": overrides,
         "effective": lf.flag_view(effective),
-        "running": _read_env_values(lf.ENV_KEYS),
+        "running": running,
         "models": _list_ggufs(),
         "mmprojs": _list_ggufs(mmproj=True),
     }

From e4ce41d276947dac188e814b439d17de86f1e88b Mon Sep 17 00:00:00 2001
From: Hermes Bot <hermes@ordo-ai-stack.local>
Date: Tue, 23 Jun 2026 11:54:44 -0400
Subject: [PATCH 3/3] feat(model-config): per-flag help text (tooltips) in
 descriptors

Adds a HELP map and a "help" key on each descriptors() entry so the dashboard
can show a one-line explanation per llama.cpp flag. The test asserts that every
flag has a non-empty help string.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 ops-controller/llamacpp_flags.py      | 24 ++++++++++++++++++++++++
 ops-controller/test_llamacpp_flags.py |  2 ++
 2 files changed, 26 insertions(+)

diff --git a/ops-controller/llamacpp_flags.py b/ops-controller/llamacpp_flags.py
index dccdb80..ac4c0dd 100644
--- a/ops-controller/llamacpp_flags.py
+++ b/ops-controller/llamacpp_flags.py
@@ -142,6 +142,29 @@ def defaults():
     "LLAMACPP_KV_CACHE_TYPE_V": sorted(_KV_TYPES),
 }
 
+# One-line explanations surfaced as tooltips in the dashboard flag UI.
+HELP = {
+    "LLAMACPP_MODEL": "The GGUF weights file llama.cpp loads as the chat model.",
+    "LLAMACPP_CTX_SIZE": "Context window in tokens. Stack-wide cap (Open WebUI, Cline, etc.); larger = more KV-cache VRAM.",
+    "LLAMACPP_GPU_LAYERS": "How many model layers to offload to the GPU. -1 = all on GPU.",
+    "LLAMACPP_ROPE_SCALING": "Method to stretch context beyond the model's native length. 'none' = native; 'yarn'/'linear' extend it.",
+    "LLAMACPP_ROPE_SCALE": "Context-extension factor used with rope scaling (e.g. 2 = double the native length).",
+    "LLAMACPP_YARN_ORIG_CTX": "The model's native (pre-extension) context length, for YaRN math. 0 = unset.",
+    "LLAMACPP_OVERRIDE_KV": "Override a GGUF metadata key as key=type:value (e.g. raise the declared context_length). Empty = none.",
+    "LLAMACPP_FLASH_ATTN": "Flash Attention. 'auto' lets llama.cpp decide; 'on' forces it (required by quantized KV cache).",
+    "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": "Quantize the KV cache to fit longer context in VRAM (1 = on).",
+    "LLAMACPP_KV_CACHE_TYPE_K": "KV-cache quantization for keys. q8_0 = best quality of the quantized set; smaller types save more VRAM.",
+    "LLAMACPP_KV_CACHE_TYPE_V": "KV-cache quantization for values. q8_0 = best quality; smaller types save more VRAM.",
+    "LLAMACPP_N_PREDICT": "Hard ceiling on tokens generated per request — a backstop against runaway generation.",
+    "LLAMACPP_REASONING_BUDGET": "Max tokens the model may spend inside <think>…</think> per response.",
+    "LLAMACPP_MMPROJ": "Vision projector (mmproj GGUF) that enables image input. Empty = text-only.",
+    "LLAMACPP_PARALLEL": "Number of concurrent request slots the server handles.",
+    "LLAMACPP_USE_MMAP": "Memory-map the model file. 0 = off (avoids stale page-cache on Docker bind mounts).",
+    "LLAMACPP_EXTRA_ARGS": "Raw llama-server flags appended verbatim — escape hatch for anything without a dedicated field.",
+    "MTP_ENABLED": "Multi-Token Prediction speculative decoding (~1.7× faster), using the model's built-in draft head.",
+    "MTP_N_MAX": "Max speculative draft tokens per step (1–6). Hardware-dependent; try a few values.",
+}
+
 
 def descriptors():
     """JSON-safe flag metadata for the dashboard to build its form (no callables)."""
@@ -153,6 +176,7 @@ def descriptors():
             "kind": d["kind"],
             "choices": CHOICES.get(key),
             "default": DEFAULTS.get(key),
+            "help": HELP.get(key),
         })
     return out
 
diff --git a/ops-controller/test_llamacpp_flags.py b/ops-controller/test_llamacpp_flags.py
index 27fcef3..f37115f 100644
--- a/ops-controller/test_llamacpp_flags.py
+++ b/ops-controller/test_llamacpp_flags.py
@@ -211,3 +211,5 @@ def test_descriptors_are_json_safe_and_cover_flags():
     rope = next(d for d in desc if d["key"] == "LLAMACPP_ROPE_SCALING")
     assert rope["choices"] == ["none", "linear", "yarn"]
     assert rope["kind"] == "enum"
+    # every flag carries a non-empty help string for the UI tooltip
+    assert all(d.get("help") for d in desc), [d["key"] for d in desc if not d.get("help")]