diff --git a/ops-controller/Dockerfile b/ops-controller/Dockerfile
index 39e1271..f788762 100644
--- a/ops-controller/Dockerfile
+++ b/ops-controller/Dockerfile
@@ -13,7 +13,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl ca-certifi
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-COPY main.py audit.py model_registry.py gpu_assignments_fmt.py ./
+COPY main.py audit.py model_registry.py gpu_assignments_fmt.py llamacpp_flags.py ./
 
 # Run as non-root user (docker group for socket access)
 RUN groupadd -g 999 docker && useradd -m -u 1000 -G docker appuser
diff --git a/ops-controller/llamacpp_flags.py b/ops-controller/llamacpp_flags.py
new file mode 100644
index 0000000..ac4c0dd
--- /dev/null
+++ b/ops-controller/llamacpp_flags.py
@@ -0,0 +1,261 @@
+"""Single source of truth for llama.cpp launch flags the dashboard/ops-controller
+may set, and how each is validated.
+
+Pure logic (no FastAPI/docker) so it is unit-testable and importable by both the
+API layer (validation + the env-key allowlist) and the render step. The dashboard
+fetches these descriptors to build its flag UI. MTP is exposed as two virtual
+flags (MTP_ENABLED / MTP_N_MAX) that render into LLAMACPP_EXTRA_ARGS.
+"""
+from __future__ import annotations
+
+import re
+
+# Mainline llama.cpp KV cache types (the fork-only tbq*/tbqp* are intentionally
+# excluded — they do not exist on the pinned ggml-org build).
+_KV_TYPES = {"q8_0", "q4_0", "q4_1", "q5_0", "q5_1", "iq4_nl", "f16"}
+# EXTRA_ARGS is word-split into argv (NOT shell-eval'd), but keep a strict
+# whitelist anyway as defense-in-depth against injection via the run script.
+_EXTRA_ARGS_RE = re.compile(r"^[a-zA-Z0-9 _.=:/-]*$")
+_OVERRIDE_KV_RE = re.compile(r"^[\w.]+=[a-z0-9]+:.+$")
+
+
+def _int(lo, hi):
+    """Build a validator for plain decimal integers within [lo, hi] (inclusive)."""
+    def v(val):
+        s = str(val).strip()
+        # int() also accepts "1_000", "+5" and non-ASCII digits; the raw string is
+        # what gets stored, so only the plain ASCII decimal form is allowed.
+        if not re.fullmatch(r"-?[0-9]+", s):
+            return "must be an integer"
+        return None if lo <= int(s) <= hi else f"must be between {lo} and {hi}"
+    return v
+
+
+def _float_min(lo):
+    """Build a validator for finite numbers >= lo."""
+    def v(val):
+        try:
+            f = float(str(val).strip())
+        except (TypeError, ValueError):
+            return "must be a number"
+        # float() parses "nan"/"inf"; nan fails every comparison, inf the upper bound.
+        return None if lo <= f < float("inf") else f"must be a finite number >= {lo}"
+    return v
+
+
+def _enum(choices):
+    """Build a validator: None if str(value) is in `choices`, else an error."""
+    return lambda candidate: (f"must be one of {sorted(choices)}"
+                              if str(candidate) not in choices else None)
+
+
+def _bool(val):
+    return "must be 0 or 1" if str(val) not in ("0", "1") else None  # 0/1 flag only
+
+
+def _override_kv(val):
+    """Accept an empty value or a `key=type:value` GGUF metadata override."""
+    text = str(val).strip()
+    if text and _OVERRIDE_KV_RE.match(text) is None:
+        return "must be key=type:value (e.g. arch.context_length=int:524288) or empty"
+    return None
+
+
+def _extra_args(val):
+    return (None if _EXTRA_ARGS_RE.fullmatch(str(val))  # `$` alone lets a trailing "\n" through
+            else "contains disallowed characters (allowed: letters, digits, space, and _ . = : / -)")
+
+
+def _gguf(val):
+    return "must be a .gguf filename" if not str(val).strip().endswith(".gguf") else None
+
+
+def _gguf_or_empty(val):
+    name = str(val).strip()
+    return "must be a .gguf path or empty" if name and not name.endswith(".gguf") else None
+
+
+# key -> {group, kind, validate}. `kind` drives the UI input widget; `validate(value)` returns an error string or None.
+FLAGS = {
+    "LLAMACPP_MODEL":                     {"group": "core",       "kind": "model",  "validate": _gguf},
+    "LLAMACPP_CTX_SIZE":                  {"group": "core",       "kind": "int",    "validate": _int(4096, 1048576)},
+    "LLAMACPP_GPU_LAYERS":                {"group": "core",       "kind": "int",    "validate": _int(-1, 1000)},
+    "LLAMACPP_ROPE_SCALING":              {"group": "context",    "kind": "enum",   "validate": _enum({"none", "linear", "yarn"})},
+    "LLAMACPP_ROPE_SCALE":                {"group": "context",    "kind": "float",  "validate": _float_min(1.0)},
+    "LLAMACPP_YARN_ORIG_CTX":             {"group": "context",    "kind": "int",    "validate": _int(0, 1048576)},
+    "LLAMACPP_OVERRIDE_KV":               {"group": "context",    "kind": "string", "validate": _override_kv},
+    "LLAMACPP_FLASH_ATTN":                {"group": "attention",  "kind": "enum",   "validate": _enum({"auto", "on", "off"})},
+    "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": {"group": "attention", "kind": "bool", "validate": _bool},
+    "LLAMACPP_KV_CACHE_TYPE_K":           {"group": "attention",  "kind": "enum",   "validate": _enum(_KV_TYPES)},
+    "LLAMACPP_KV_CACHE_TYPE_V":           {"group": "attention",  "kind": "enum",   "validate": _enum(_KV_TYPES)},
+    "LLAMACPP_N_PREDICT":                 {"group": "gen",        "kind": "int",    "validate": _int(0, 1048576)},
+    "LLAMACPP_REASONING_BUDGET":          {"group": "gen",        "kind": "int",    "validate": _int(0, 1048576)},
+    "LLAMACPP_MMPROJ":                    {"group": "multimodal", "kind": "path",   "validate": _gguf_or_empty},
+    "LLAMACPP_PARALLEL":                  {"group": "advanced",   "kind": "int",    "validate": _int(1, 64)},
+    "LLAMACPP_USE_MMAP":                  {"group": "advanced",   "kind": "bool",   "validate": _bool},
+    "LLAMACPP_EXTRA_ARGS":                {"group": "advanced",   "kind": "string", "validate": _extra_args},
+}
+
+# Virtual UI flags — rendered into LLAMACPP_EXTRA_ARGS, never written as raw env keys.
+VIRTUAL = {
+    "MTP_ENABLED": {"group": "mtp", "kind": "bool", "validate": _bool},
+    "MTP_N_MAX":   {"group": "mtp", "kind": "int",  "validate": _int(1, 6)},
+}
+
+# The raw .env keys this module manages (excludes virtual flags).
+ENV_KEYS = set(FLAGS)
+
+# Baseline defaults (model-agnostic). effective(flag) = override if set else default.
+# Per-model specifics (a 512K model's ctx/rope, vision mmproj, MTP) live as the
+# model's overrides in the registry — NOT here.
+DEFAULTS = {
+    "LLAMACPP_MODEL": "",  # required — no sensible default; endpoint rejects empty
+    "LLAMACPP_CTX_SIZE": "262144",
+    "LLAMACPP_GPU_LAYERS": "-1",
+    "LLAMACPP_ROPE_SCALING": "none",
+    "LLAMACPP_ROPE_SCALE": "1",
+    "LLAMACPP_YARN_ORIG_CTX": "0",
+    "LLAMACPP_OVERRIDE_KV": "",
+    "LLAMACPP_FLASH_ATTN": "auto",
+    "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": "1",
+    "LLAMACPP_KV_CACHE_TYPE_K": "q8_0",
+    "LLAMACPP_KV_CACHE_TYPE_V": "q8_0",
+    "LLAMACPP_N_PREDICT": "65536",
+    "LLAMACPP_REASONING_BUDGET": "32768",
+    "LLAMACPP_MMPROJ": "",
+    "LLAMACPP_PARALLEL": "1",
+    "LLAMACPP_USE_MMAP": "0",
+    "LLAMACPP_EXTRA_ARGS": "--reasoning-format deepseek",
+}
+
+
+def defaults():
+    """Return a new dict of the baseline defaults, one entry per managed env key."""
+    return {**DEFAULTS}
+
+
+# JSON-safe enum choices for the dashboard form (mirrors the validators above).
+CHOICES = {
+    "LLAMACPP_ROPE_SCALING": ["none", "linear", "yarn"],
+    "LLAMACPP_FLASH_ATTN": ["auto", "on", "off"],
+    "LLAMACPP_KV_CACHE_TYPE_K": sorted(_KV_TYPES),
+    "LLAMACPP_KV_CACHE_TYPE_V": sorted(_KV_TYPES),
+}
+
+# One-line explanations surfaced as tooltips in the dashboard flag UI.
+HELP = {
+    "LLAMACPP_MODEL": "The GGUF weights file llama.cpp loads as the chat model.",
+    "LLAMACPP_CTX_SIZE": "Context window in tokens. Stack-wide cap (Open WebUI, Cline, etc.); larger = more KV-cache VRAM.",
+    "LLAMACPP_GPU_LAYERS": "How many model layers to offload to the GPU. -1 = all on GPU.",
+    "LLAMACPP_ROPE_SCALING": "Method to stretch context beyond the model's native length. 'none' = native; 'yarn'/'linear' extend it.",
+    "LLAMACPP_ROPE_SCALE": "Context-extension factor used with rope scaling (e.g. 2 = double the native length).",
+    "LLAMACPP_YARN_ORIG_CTX": "The model's native (pre-extension) context length, for YaRN math. 0 = unset.",
+    "LLAMACPP_OVERRIDE_KV": "Override a GGUF metadata key as key=type:value (e.g. raise the declared context_length). Empty = none.",
+    "LLAMACPP_FLASH_ATTN": "Flash Attention. 'auto' lets llama.cpp decide; 'on' forces it (required by quantized KV cache).",
+    "LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION": "Quantize the KV cache to fit longer context in VRAM (1 = on).",
+    "LLAMACPP_KV_CACHE_TYPE_K": "KV-cache quantization for keys. q8_0 = best quality of the quantized set; smaller types save more VRAM.",
+    "LLAMACPP_KV_CACHE_TYPE_V": "KV-cache quantization for values. q8_0 = best quality; smaller types save more VRAM.",
+    "LLAMACPP_N_PREDICT": "Hard ceiling on tokens generated per request — a backstop against runaway generation.",
+    "LLAMACPP_REASONING_BUDGET": "Max tokens the model may spend inside <think>…</think> per response.",
+    "LLAMACPP_MMPROJ": "Vision projector (mmproj GGUF) that enables image input. Empty = text-only.",
+    "LLAMACPP_PARALLEL": "Number of concurrent request slots the server handles.",
+    "LLAMACPP_USE_MMAP": "Memory-map the model file. 0 = off (avoids stale page-cache on Docker bind mounts).",
+    "LLAMACPP_EXTRA_ARGS": "Raw llama-server flags appended verbatim — escape hatch for anything without a dedicated field.",
+    "MTP_ENABLED": "Multi-Token Prediction speculative decoding (~1.7× faster), using the model's built-in draft head.",
+    "MTP_N_MAX": "Max speculative draft tokens per step (1–6). Hardware-dependent; try a few values.",
+}
+
+
+def descriptors():
+    """List one JSON-serializable metadata dict (no callables) per real and virtual flag."""
+    return [
+        {
+            "key": name,
+            "group": spec["group"],
+            "kind": spec["kind"],
+            "choices": CHOICES.get(name),
+            "default": DEFAULTS.get(name),
+            "help": HELP.get(name),
+        }
+        for name, spec in {**FLAGS, **VIRTUAL}.items()
+    ]
+
+
+def validate(key, value):
+    """Check one (key, value) pair; return None when valid, else an error string."""
+    for table in (FLAGS, VIRTUAL):
+        if table.get(key):
+            return table[key]["validate"](value)
+    return f"{key} is not a managed llama.cpp flag"
+
+
+def validate_all(values):
+    """Map each invalid key in `values` to its error string ({} when all pass)."""
+    return {k: err for k, err in ((k, validate(k, v)) for k, v in values.items()) if err is not None}
+
+
+def mtp_to_extra_args(enabled, n_max):
+    """Build the EXTRA_ARGS fragment that switches MTP on ("" when disabled)."""
+    # n_max is coerced only on the enabled path, so it may be None when disabled.
+    fragment = f"--spec-type draft-mtp --spec-draft-n-max {int(n_max)}" if enabled else ""
+    return fragment
+
+
+def parse_mtp_from_extra_args(extra):
+    """Inverse of mtp_to_extra_args: (enabled, n_max | None)."""
+    s = str(extra or "")
+    if not re.search(r"--spec-type\s+draft-mtp\b", s):  # the flag pair, not any "draft-mtp" substring
+        return (False, None)
+    m = re.search(r"--spec-draft-n-max\s+(\d+)", s)
+    return (True, int(m.group(1)) if m else None)
+
+
+def _strip_mtp_args(extra):
+    """Drop the MTP --spec-* flag pairs from an args string and tidy whitespace."""
+    without_type = re.sub(r"--spec-type\s+draft-mtp\b", "", str(extra or ""))
+    without_mtp = re.sub(r"--spec-draft-n-max\s+\d+", "", without_type)
+    return re.sub(r"\s+", " ", without_mtp).strip()
+
+
+def compute_effective(baseline, overrides):
+    """Layer a model's overrides over baseline values and fold MTP into EXTRA_ARGS.
+
+    Overrides whose value is None are skipped, so the baseline value stays. The
+    virtual MTP_* keys are never copied into the result; they only decide which
+    MTP flags end up in LLAMACPP_EXTRA_ARGS. Returns a new dict.
+    """
+    merged = dict(baseline)
+    merged.update({name: str(value) for name, value in overrides.items()
+                   if name not in VIRTUAL and value is not None})
+
+    # Without a structured MTP override, keep what EXTRA_ARGS already requests.
+    current_args = merged.get("LLAMACPP_EXTRA_ARGS", "")
+    inherited_on, inherited_n = parse_mtp_from_extra_args(current_args)
+    want_on = overrides.get("MTP_ENABLED")
+    want_n = overrides.get("MTP_N_MAX")
+    mtp_on = inherited_on if want_on is None else str(want_on) == "1"
+    draft_n = (inherited_n or 2) if want_n in (None, "") else int(want_n)
+
+    # Re-render: drop any existing MTP flags, then append the fresh fragment.
+    remainder = _strip_mtp_args(current_args)
+    fragment = mtp_to_extra_args(mtp_on, draft_n)
+    merged["LLAMACPP_EXTRA_ARGS"] = f"{remainder} {fragment}".strip() if fragment else remainder
+    return merged
+
+
+def render_env_file(effective, header="# generated by ops-controller — do not hand-edit"):
+    """Serialize `effective` as override-env-file text.
+
+    Emits `header`, then a sorted KEY=value line per managed key in `effective`.
+    """
+    managed = [f"{name}={effective[name]}" for name in sorted(ENV_KEYS) if name in effective]
+    return "\n".join([header, *managed]) + "\n"
+
+
+def flag_view(effective):
+    """Managed raw env values plus MTP_ENABLED / MTP_N_MAX derived from EXTRA_ARGS."""
+    mtp_on, draft_n = parse_mtp_from_extra_args(effective.get("LLAMACPP_EXTRA_ARGS", ""))
+    shown = {name: value for name, value in effective.items() if name in ENV_KEYS}
+    # MTP_N_MAX falls back to 2 when EXTRA_ARGS carries no draft count.
+    shown.update(MTP_ENABLED="1" if mtp_on else "0", MTP_N_MAX=str(draft_n if draft_n else 2))
+    return shown
diff --git a/ops-controller/main.py b/ops-controller/main.py
index 205740f..a6f287f 100644
--- a/ops-controller/main.py
+++ b/ops-controller/main.py
@@ -49,6 +49,16 @@
     model_registry = _ilu.module_from_spec(_mr_spec)
     _mr_spec.loader.exec_module(model_registry)
 
+try:
+    import llamacpp_flags as lf
+except ModuleNotFoundError:  # pragma: no cover
+    import importlib.util as _ilu
+    _lf_spec = _ilu.spec_from_file_location(
+        "llamacpp_flags", str(Path(__file__).resolve().parent / "llamacpp_flags.py"),
+    )
+    lf = _ilu.module_from_spec(_lf_spec)
+    _lf_spec.loader.exec_module(lf)
+
 app = FastAPI(title="Ops Controller", version="1.0.0")
 logger = logging.getLogger(__name__)
 
@@ -80,6 +90,10 @@
 
 BASE_PATH = os.environ.get("BASE_PATH", ".")
 COMPOSE_FILE_ENV = os.environ.get("COMPOSE_FILE", "docker-compose.yml")
+# On-disk GGUF directory (chat models + mmproj) shown in the model-config UI.
+MODELS_DIR = Path(os.environ.get("LLAMACPP_MODELS_DIR", "/workspace/models/gguf"))
+# Services that template LLAMACPP_CTX_SIZE and must also recreate when ctx changes.
+MODEL_CONFIG_CTX_CONSUMERS = ["model-gateway"]
 
 # Services whose GPU pin the dashboard may change.
 GPU_ASSIGNABLE_SERVICES = {"llamacpp", "llamacpp-embed", "comfyui", "stt", "tts"}
@@ -227,6 +241,65 @@ def _set_env_keys(kv: dict, request=None) -> None:
     _write_text_atomic(env_path, content)
 
 
+# ---------------------------------------------------------------------------
+# Model-config control plane (dashboard) — registry overrides -> .env -> recreate
+# ---------------------------------------------------------------------------
+
+def _active_chat_record():
+    """Return the first enabled single-model `llamacpp` registry record, else None."""
+    return next(
+        (rec for rec in REGISTRY.list_models().values()
+         if rec.service == "llamacpp" and rec.runtime == "single-model" and rec.enabled),
+        None)
+
+
+def _read_env_values(keys):
+    """Values of `keys` from uncommented KEY= lines in .env (first match, quotes stripped)."""
+    env_path = REGISTRY.env_path
+    if not env_path.exists():
+        return {}
+    text = env_path.read_text(encoding="utf-8")
+    found = {}
+    for key in keys:
+        hit = re.search(rf"^{re.escape(key)}=(.*)$", text, re.MULTILINE)
+        if hit is None:
+            continue
+        raw = hit.group(1).strip()
+        quoted = len(raw) >= 2 and raw[0] == raw[-1] and raw[0] in "\"'"
+        found[key] = raw[1:-1] if quoted else raw
+    return found
+
+
+def _render_model_config_to_env(effective):
+    """Upsert every managed flag into .env in place. The ^KEY= anchor updates only
+    the active line, so commented presets in the MODEL CONFIGS block survive."""
+    env_path = REGISTRY.env_path
+    content = env_path.read_text(encoding="utf-8") if env_path.exists() else ""
+    for key in sorted(lf.ENV_KEYS):
+        if key not in effective:
+            continue
+        line = f"{key}={effective[key]}"
+        if "\n" in line or "\r" in line:
+            raise HTTPException(status_code=400, detail=f"Illegal newline in {key}")
+        # Callable replacement: a string template would expand "\n", "\1", ... in the value.
+        content, hits = re.subn(rf"^{re.escape(key)}=.*", lambda _m, line=line: line,
+                                content, flags=re.MULTILINE)
+        if not hits:
+            content = content.rstrip("\n") + f"\n{line}\n"
+    _write_text_atomic(env_path, content)
+
+
+def _list_ggufs(mmproj=False):
+    """Sorted *.gguf basenames in MODELS_DIR ([] if the scan raises OSError): the mmproj-prefixed
+    ones when `mmproj` is true, else those neither mmproj-prefixed nor containing "embed"."""
+    try:
+        found = sorted(entry.name for entry in MODELS_DIR.glob("*.gguf"))
+    except OSError:
+        return []
+    return [n for n in found
+            if n.startswith("mmproj") == bool(mmproj) and (mmproj or "embed" not in n.lower())]
+
+
 def _live_gpus() -> dict:
     """Return {uuid: {"name", "total_gb", "used_gb", "util"}} via nvidia-smi.
 
@@ -1206,6 +1279,101 @@ async def env_get(key: str, _: None = Depends(verify_token)):
     return {"key": key, "value": raw}
 
 
+class ModelConfigBody(BaseModel):  # request body for POST /model-config
+    overrides: dict = Field(default_factory=dict)  # flag key -> new value; None clears a stored override
+    confirm: bool = False  # must be true for the POST handler to apply anything
+    dry_run: bool = False  # validate only; the handler echoes the overrides back
+
+
+@app.get("/model-config")
+async def model_config_get(_: None = Depends(verify_token)):
+    """Report model-control state for the dashboard: flag descriptors, defaults,
+    the active model, values that differ from the defaults, the effective flag
+    view, the raw .env values, and the on-disk model/mmproj lists."""
+    # The deployed .env layered over the defaults is the current state, not the registry.
+    base = lf.defaults()
+    running = _read_env_values(lf.ENV_KEYS)
+    effective = {**base, **running}
+    rec = _active_chat_record()
+    if not effective.get("LLAMACPP_MODEL") and rec and rec.source.get("file"):
+        effective["LLAMACPP_MODEL"] = rec.source["file"]
+    # Report every effective value that differs from its default as an override.
+    changed = {}
+    for key in lf.ENV_KEYS:
+        if key in effective and effective[key] != base.get(key, ""):
+            changed[key] = effective[key]
+    return {
+        "flags": lf.descriptors(),
+        "defaults": base,
+        "active_model": effective.get("LLAMACPP_MODEL", ""),
+        "overrides": changed,
+        "effective": lf.flag_view(effective),
+        "running": running,
+        "models": _list_ggufs(),
+        "mmprojs": _list_ggufs(mmproj=True),
+    }
+
+
+@app.post("/model-config")
+async def model_config_post(body: ModelConfigBody, request: Request,
+                            _: None = Depends(verify_token)):
+    """Validate + apply model-config overrides via the ONE write path: persist to
+    the registry, render into .env, recreate llamacpp (+ ctx consumers)."""
+    known = lf.ENV_KEYS | set(lf.VIRTUAL)
+    # None means "clear" and skips value checks, but the key must still be managed.
+    errs = lf.validate_all({k: v for k, v in body.overrides.items()
+                            if v is not None or k not in known})
+    if errs:
+        raise HTTPException(status_code=400, detail={"validation": errs})
+    if body.dry_run:
+        return {"would": "apply", "overrides": body.overrides}
+    if not body.confirm:
+        raise HTTPException(status_code=400, detail="Set {\"confirm\": true} to apply.")
+
+    rec = _active_chat_record()
+    if rec is None:
+        raise HTTPException(status_code=404, detail="No active single-model llamacpp record")
+
+    config = dict(rec.config)
+    source_file = rec.source.get("file", "")
+    for k, v in body.overrides.items():
+        if k == "LLAMACPP_MODEL":
+            source_file = str(v) if v else source_file
+            config.pop("LLAMACPP_MODEL", None)
+        elif v is None:
+            config.pop(k, None)
+        else:
+            config[k] = str(v)
+    ctx_touched = "LLAMACPP_CTX_SIZE" in body.overrides
+
+    overrides = {**config, "LLAMACPP_MODEL": source_file} if source_file else dict(config)
+    effective = lf.compute_effective(lf.defaults(), overrides)
+
+    model = effective.get("LLAMACPP_MODEL", "")
+    if not model:
+        raise HTTPException(status_code=400, detail="A model file must be set")
+    # Confine the lookup to MODELS_DIR: an absolute path or ".." would escape it.
+    root = os.path.abspath(MODELS_DIR)
+    target = os.path.abspath(os.path.join(root, model))
+    if not target.startswith(os.path.join(root, "")) or not os.path.exists(target):
+        raise HTTPException(status_code=400, detail=f"Model file not found: {model}")
+
+    _render_model_config_to_env(effective)
+    rec.config = config
+    rec.source = {**rec.source, "file": source_file}
+    rec.updated_by = "model-config"
+    REGISTRY.upsert(rec)
+
+    services = ["llamacpp"] + (MODEL_CONFIG_CTX_CONSUMERS if ctx_touched else [])
+    for svc in services:
+        _recreate_service(svc, request)
+
+    _audit("model_config", model, "ok", f"keys={sorted(body.overrides)}",
+           correlation_id=_correlation_id(request))
+    return {"ok": True, "active_model": model,
+            "effective": lf.flag_view(effective), "recreated": services}
+
+
 @app.post("/services/{service_id}/recreate")
 async def service_recreate(
     service_id: str, body: ConfirmBody, request: Request,
diff --git a/ops-controller/test_llamacpp_flags.py b/ops-controller/test_llamacpp_flags.py
new file mode 100644
index 0000000..f37115f
--- /dev/null
+++ b/ops-controller/test_llamacpp_flags.py
@@ -0,0 +1,215 @@
+"""Flag-schema validation tests (pure; no FastAPI/docker).
+
+llamacpp_flags is the single source of truth for which llama.cpp launch knobs the
+dashboard/ops-controller may set and how each is validated. Drives API validation,
+the env-key allowlist, and MTP<->extra_args rendering.
+"""
+from __future__ import annotations
+
+import importlib.util
+from pathlib import Path
+
+_PATH = Path(__file__).resolve().parent / "llamacpp_flags.py"
+_spec = importlib.util.spec_from_file_location("llamacpp_flags_under_test", _PATH)
+lf = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(lf)
+
+
+# --- enum flags ---
+def test_enum_accepts_valid():  # one accepted choice for each of three enum flags
+    assert lf.validate("LLAMACPP_ROPE_SCALING", "yarn") is None
+    assert lf.validate("LLAMACPP_FLASH_ATTN", "auto") is None
+    assert lf.validate("LLAMACPP_KV_CACHE_TYPE_K", "q8_0") is None
+
+
+def test_enum_rejects_invalid():  # values outside a flag's choice set yield an error string
+    assert lf.validate("LLAMACPP_ROPE_SCALING", "bogus") is not None
+    assert lf.validate("LLAMACPP_FLASH_ATTN", "maybe") is not None
+    # fork-only TurboQuant KV type is NOT valid on the pinned mainline build
+    assert lf.validate("LLAMACPP_KV_CACHE_TYPE_K", "tbq3_0") is not None
+
+
+# --- int flags with range ---
+def test_int_in_range_ok():
+    assert lf.validate("LLAMACPP_CTX_SIZE", "262144") is None
+    assert lf.validate("LLAMACPP_CTX_SIZE", 262144) is None  # a real int works too: validators str() the value
+
+
+def test_int_out_of_range_or_nonint():
+    assert lf.validate("LLAMACPP_CTX_SIZE", "1000") is not None        # below min (4096)
+    assert lf.validate("LLAMACPP_CTX_SIZE", "99999999") is not None    # above max (1048576)
+    assert lf.validate("LLAMACPP_CTX_SIZE", "notanint") is not None    # not an integer
+
+
+def test_gpu_layers_allows_negative_one():
+    assert lf.validate("LLAMACPP_GPU_LAYERS", "-1") is None  # -1 is the lower bound of the allowed range
+
+
+# --- float flag ---
+def test_rope_scale_float():
+    assert lf.validate("LLAMACPP_ROPE_SCALE", "2") is None     # integer text parses as a float
+    assert lf.validate("LLAMACPP_ROPE_SCALE", "1.5") is None
+    assert lf.validate("LLAMACPP_ROPE_SCALE", "0.5") is not None   # must be >= 1
+    assert lf.validate("LLAMACPP_ROPE_SCALE", "abc") is not None   # not a number
+
+
+# --- bool flag ---
+def test_bool_flag():  # bool-kind flags accept only "0" and "1"
+    assert lf.validate("LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION", "1") is None
+    assert lf.validate("LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION", "0") is None
+    assert lf.validate("LLAMACPP_ENABLE_KV_CACHE_QUANTIZATION", "2") is not None
+
+
+# --- override_kv format ---
+def test_override_kv_format():
+    assert lf.validate("LLAMACPP_OVERRIDE_KV", "") is None  # empty = unset
+    assert lf.validate("LLAMACPP_OVERRIDE_KV", "qwen35moe.context_length=int:524288") is None  # key=type:value
+    assert lf.validate("LLAMACPP_OVERRIDE_KV", "garbage") is not None  # no key=type:value shape
+
+
+# --- extra_args whitelist (anti shell-injection) ---
+def test_extra_args_whitelist():  # only whitelisted characters may appear in EXTRA_ARGS
+    assert lf.validate("LLAMACPP_EXTRA_ARGS", "--spec-type draft-mtp --spec-draft-n-max 2") is None
+    assert lf.validate("LLAMACPP_EXTRA_ARGS", "--reasoning-format deepseek") is None
+    assert lf.validate("LLAMACPP_EXTRA_ARGS", "$(evil)") is not None      # shell metachars
+    assert lf.validate("LLAMACPP_EXTRA_ARGS", "a; rm -rf x") is not None  # semicolon
+
+
+# --- MTP first-class flags ---
+def test_mtp_n_max_range():  # MTP_N_MAX is validated as an integer in 1..6
+    assert lf.validate("MTP_N_MAX", "2") is None
+    assert lf.validate("MTP_N_MAX", "0") is not None  # below the range
+    assert lf.validate("MTP_N_MAX", "9") is not None  # above the range
+
+
+def test_mtp_enabled_bool():  # the virtual MTP_ENABLED flag uses the 0/1 validator
+    assert lf.validate("MTP_ENABLED", "1") is None
+    assert lf.validate("MTP_ENABLED", "nope") is not None
+
+
+# --- unknown key ---
+def test_unknown_key_rejected():
+    assert lf.validate("LLAMACPP_NOT_A_FLAG", "x") is not None  # key is in neither FLAGS nor VIRTUAL
+
+
+# --- ENV_KEYS exposes the managed set ---
+def test_env_keys_set():  # ENV_KEYS holds the real flags only
+    assert "LLAMACPP_MODEL" in lf.ENV_KEYS
+    assert "LLAMACPP_CTX_SIZE" in lf.ENV_KEYS
+    assert "LLAMACPP_OVERRIDE_KV" in lf.ENV_KEYS
+    # MTP_* are virtual (rendered into EXTRA_ARGS), not raw env keys
+    assert "MTP_ENABLED" not in lf.ENV_KEYS
+
+
+# --- validate_all ---
+def test_validate_all_reports_only_invalid():
+    errs = lf.validate_all({
+        "LLAMACPP_CTX_SIZE": "262144",        # ok
+        "LLAMACPP_ROPE_SCALING": "bogus",     # bad
+        "LLAMACPP_FLASH_ATTN": "off",         # ok
+    })
+    assert set(errs) == {"LLAMACPP_ROPE_SCALING"}  # valid keys are absent from the result
+
+
+# --- MTP <-> extra_args round-trip ---
+def test_mtp_renders_into_extra_args():
+    # Pin the exact fragment; a substring check such as `"3" in frag` is too loose.
+    assert lf.mtp_to_extra_args(True, 3) == "--spec-type draft-mtp --spec-draft-n-max 3"
+
+
+def test_mtp_disabled_renders_empty():
+    assert lf.mtp_to_extra_args(False, 2).strip() == ""  # disabled renders no flags, whatever n_max is
+
+
+def test_mtp_parsed_from_extra_args():  # MTP flags are found even when other args follow them
+    enabled, n_max = lf.parse_mtp_from_extra_args("--spec-type draft-mtp --spec-draft-n-max 4 --reasoning-format deepseek")
+    assert enabled is True
+    assert n_max == 4
+
+
+def test_mtp_parse_absent():
+    # Without MTP flags both halves of the result are pinned: disabled, and no
+    # draft count.
+    assert lf.parse_mtp_from_extra_args("--reasoning-format deepseek") == (False, None)
+
+# --- compute_effective (baseline + overrides) + render ---
+BASELINE = {
+    "LLAMACPP_MODEL": "base.gguf",
+    "LLAMACPP_CTX_SIZE": "262144",
+    "LLAMACPP_ROPE_SCALING": "none",
+    "LLAMACPP_EXTRA_ARGS": "--reasoning-format deepseek",
+    "LLAMACPP_KV_CACHE_TYPE_K": "q8_0",
+}
+
+
+def test_effective_override_wins():
+    eff = lf.compute_effective(BASELINE, {"LLAMACPP_CTX_SIZE": "524288"})
+    assert eff["LLAMACPP_CTX_SIZE"] == "524288"  # the override replaces the baseline value
+    assert eff["LLAMACPP_ROPE_SCALING"] == "none"  # inherited from baseline
+
+
+def test_effective_none_clears_to_baseline():
+    eff = lf.compute_effective(BASELINE, {"LLAMACPP_CTX_SIZE": None})  # None = no override
+    assert eff["LLAMACPP_CTX_SIZE"] == "262144"  # the baseline value is kept
+
+
+def test_effective_mtp_enabled_folds_into_extra_args():
+    eff = lf.compute_effective(BASELINE, {"MTP_ENABLED": "1", "MTP_N_MAX": "3"})
+    ex = eff["LLAMACPP_EXTRA_ARGS"]
+    assert "--reasoning-format deepseek" in ex  # existing non-MTP args are kept
+    assert "--spec-type draft-mtp" in ex and "--spec-draft-n-max 3" in ex  # MTP flags are appended
+
+
+def test_effective_mtp_disabled_strips_spec_args():
+    base = dict(BASELINE,
+                LLAMACPP_EXTRA_ARGS="--spec-type draft-mtp --spec-draft-n-max 2 --reasoning-format deepseek")
+    eff = lf.compute_effective(base, {"MTP_ENABLED": "0"})  # explicit "off" beats the baseline's MTP flags
+    assert "draft-mtp" not in eff["LLAMACPP_EXTRA_ARGS"]
+    assert "--reasoning-format deepseek" in eff["LLAMACPP_EXTRA_ARGS"]  # unrelated args survive
+
+
+def test_render_env_file_only_managed_keys():
+    eff = lf.compute_effective(BASELINE, {})
+    text = lf.render_env_file(eff)
+    assert "LLAMACPP_CTX_SIZE=262144" in text
+    assert "MTP_ENABLED" not in text  # virtual flag is not a raw env key
+    parsed = dict(line.split("=", 1) for line in text.strip().splitlines()
+                  if "=" in line and not line.startswith("#"))  # re-parse the KEY=value lines, skipping the header
+    assert parsed["LLAMACPP_MODEL"] == "base.gguf"
+
+
+def test_overrides_for_model_extracts_mtp_virtuals():  # exercises lf.flag_view
+    # given an effective EXTRA_ARGS with MTP, the UI-facing view exposes the virtuals
+    view = lf.flag_view({"LLAMACPP_EXTRA_ARGS": "--spec-type draft-mtp --spec-draft-n-max 4"})
+    assert view["MTP_ENABLED"] == "1"
+    assert view["MTP_N_MAX"] == "4"
+
+
+def test_defaults_cover_every_managed_key():
+    d = lf.defaults()
+    assert set(d) == lf.ENV_KEYS  # one default per managed flag, no extras
+    # every default value is itself valid (LLAMACPP_MODEL is skipped: its "" default fails the .gguf check)
+    assert lf.validate_all({k: v for k, v in d.items() if k != "LLAMACPP_MODEL"}) == {}
+
+
+def test_reset_to_default_via_effective():
+    # an override sets ctx high; clearing it (None) falls back to the default baseline
+    base = lf.defaults()
+    eff = lf.compute_effective(base, {"LLAMACPP_CTX_SIZE": "524288"})
+    assert eff["LLAMACPP_CTX_SIZE"] == "524288"
+    eff2 = lf.compute_effective(base, {"LLAMACPP_CTX_SIZE": None})  # same baseline, override cleared
+    assert eff2["LLAMACPP_CTX_SIZE"] == "262144"  # default
+
+
+def test_descriptors_are_json_safe_and_cover_flags():
+    import json
+    desc = lf.descriptors()
+    json.dumps(desc)  # must be serializable (no callables)
+    keys = {d["key"] for d in desc}
+    assert lf.ENV_KEYS <= keys  # every real flag has a descriptor
+    assert {"MTP_ENABLED", "MTP_N_MAX"} <= keys  # and so do the virtual MTP flags
+    rope = next(d for d in desc if d["key"] == "LLAMACPP_ROPE_SCALING")
+    assert rope["choices"] == ["none", "linear", "yarn"]  # order as declared in CHOICES
+    assert rope["kind"] == "enum"
+    # every flag carries a non-empty help string for the UI tooltip
+    assert all(d.get("help") for d in desc), [d["key"] for d in desc if not d.get("help")]
diff --git a/ops-controller/test_model_config_endpoint.py b/ops-controller/test_model_config_endpoint.py
new file mode 100644
index 0000000..5c2eb88
--- /dev/null
+++ b/ops-controller/test_model_config_endpoint.py
@@ -0,0 +1,120 @@
+"""Endpoint tests for GET/POST /model-config (the dashboard control-plane API).
+
+Uses temp .env + registry + models dir, reloads main against them, and stubs
+_recreate_service so no docker runs.
+"""
+import importlib
+import json
+
+import pytest
+from fastapi.testclient import TestClient
+
+TOKEN = "test-token-for-test"
+AUTH = {"Authorization": f"Bearer {TOKEN}"}
+
+
+@pytest.fixture
+def app_env(tmp_path, monkeypatch):  # yields (main module, .env path, registry path, recreate-call log)
+    env = tmp_path / ".env"
+    env.write_text(
+        "# === MODEL CONFIGS ===\n"
+        "LLAMACPP_MODEL=base.gguf\n"
+        "LLAMACPP_CTX_SIZE=262144\n"
+        "LLAMACPP_ROPE_SCALING=none\n"
+        "LLAMACPP_EXTRA_ARGS=--reasoning-format deepseek\n"
+        "# LLAMACPP_MODEL=preset-a3b.gguf\n",  # commented preset MUST survive edits
+        encoding="utf-8",
+    )
+    reg = tmp_path / "registry.json"
+    reg.write_text(json.dumps({"version": 1, "models": {
+        "local-chat": {"id": "local-chat", "kind": "chat", "service": "llamacpp",
+                       "runtime": "single-model", "enabled": True,
+                       "source": {"file": "base.gguf"}, "config": {},
+                       "gpu_uuid": None, "est_vram_gb": 0.0,
+                       "updated_by": "test", "updated_at": None}}}), encoding="utf-8")
+    models = tmp_path / "gguf"
+    models.mkdir()
+    (models / "base.gguf").write_bytes(b"x")  # placeholder files: only their existence matters here
+    (models / "dense27b.gguf").write_bytes(b"x")
+
+    monkeypatch.setenv("OPS_CONTROLLER_TOKEN", TOKEN)
+    monkeypatch.setenv("OPS_ENV_PATH", str(env))
+    monkeypatch.setenv("MODEL_REGISTRY_PATH", str(reg))
+    monkeypatch.setenv("LLAMACPP_MODELS_DIR", str(models))
+
+    import ops_controller.main as m
+    importlib.reload(m)  # reload so main is evaluated against the temp paths set above
+    calls = []
+    monkeypatch.setattr(m, "_recreate_service",  # stub: record the service name, never run docker
+                        lambda svc, request=None: (calls.append(svc), {"ok": True, "service": svc})[1])
+    return m, env, reg, calls
+
+
+def test_get_model_config(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).get("/model-config", headers=AUTH)
+    assert r.status_code == 200, r.text
+    b = r.json()
+    assert b["active_model"] == "base.gguf"  # taken from the fixture's .env
+    assert any(d["key"] == "LLAMACPP_CTX_SIZE" for d in b["flags"])
+    assert "base.gguf" in b["models"] and "dense27b.gguf" in b["models"]  # both on-disk files are listed
+    assert b["effective"]["LLAMACPP_CTX_SIZE"] == "262144"
+
+
+def test_get_requires_auth(app_env):
+    m, env, reg, calls = app_env
+    assert TestClient(m.app).get("/model-config").status_code in (401, 403)  # no Authorization header sent
+
+
+def test_post_sets_override_and_recreates(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"confirm": True, "overrides": {"LLAMACPP_CTX_SIZE": "524288"}})
+    assert r.status_code == 200, r.text
+    txt = env.read_text()
+    assert "LLAMACPP_CTX_SIZE=524288" in txt  # the active line was rewritten
+    assert "# LLAMACPP_MODEL=preset-a3b.gguf" in txt  # commented preset preserved
+    assert "llamacpp" in calls  # the stubbed recreate was invoked
+    cfg = json.loads(reg.read_text())["models"]["local-chat"]["config"]
+    assert cfg.get("LLAMACPP_CTX_SIZE") == "524288"  # the override is persisted in the registry too
+
+
+def test_post_validation_400_no_recreate(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"confirm": True, "overrides": {"LLAMACPP_ROPE_SCALING": "bogus"}})
+    assert r.status_code == 400
+    assert "LLAMACPP_ROPE_SCALING" in r.text  # the offending key is named in the error body
+    assert calls == []  # nothing was recreated
+
+
+def test_post_requires_confirm(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"overrides": {"LLAMACPP_CTX_SIZE": "524288"}})
+    assert r.status_code == 400 and calls == [], r.text  # rejected before any recreate
+
+
+def test_post_clear_reverts_to_default(app_env):
+    m, env, reg, calls = app_env
+    for value in ("524288", None):  # set an override, then clear it with None
+        r = TestClient(m.app).post("/model-config", headers=AUTH, json={"confirm": True, "overrides": {"LLAMACPP_CTX_SIZE": value}})
+        assert r.status_code == 200, r.text  # a failed POST must not leave the final check passing by accident
+    assert "LLAMACPP_CTX_SIZE=262144" in env.read_text()  # back to default baseline
+
+
+def test_post_model_swap_updates_source(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"confirm": True, "overrides": {"LLAMACPP_MODEL": "dense27b.gguf"}})
+    assert r.status_code == 200, r.text
+    assert "LLAMACPP_MODEL=dense27b.gguf" in env.read_text()  # .env points at the new model
+    assert json.loads(reg.read_text())["models"]["local-chat"]["source"]["file"] == "dense27b.gguf"  # and so does the registry
+
+
+def test_post_rejects_missing_model_file(app_env):
+    m, env, reg, calls = app_env
+    r = TestClient(m.app).post("/model-config", headers=AUTH,
+                               json={"confirm": True, "overrides": {"LLAMACPP_MODEL": "nope.gguf"}})  # not in the models dir
+    assert r.status_code == 400
+    assert calls == []  # nothing was recreated