Commit 5c3a1cd

Andrey Golovanov and claude committed
Run simulation inside generation loop: iterate until the scenario RUNS, not just passes inspect

The generation loop previously declared success when ngraph inspect passed, but simulation could still fail (unresolved demand patterns, invalid failure modes). Qwen3-235B produced a scenario whose demands referenced nodes that didn't exist: inspect passed, but ngraph run crashed.

Now the loop runs ngraph as the final validation step. If simulation fails, the error feeds back to the LLM for another attempt, and the loop iterates until the scenario both inspects AND runs successfully. As a result, GenerationResult now includes results_data, and the outer loop no longer runs simulation separately. For LLM-generated scenarios (10-20 nodes), simulation takes under 1s, negligible next to the LLM call.

Also fixed: the OpenAI backend doubled /v1 when constructing request URLs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 45867ac commit 5c3a1cd
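
The change is easiest to see as a schematic of the revised loop. The sketch below is simplified and self-contained: inspect_scenario, run_scenario, and generate are stand-ins invented for illustration, not the real APIs; the actual implementation is in generation_loop.py further down.

# Simplified sketch of the new control flow: simulation, not inspect,
# is the final gate, and its error text becomes the next LLM feedback.
from dataclasses import dataclass

@dataclass
class Attempt:
    success: bool
    error: str = ""

def inspect_scenario(yaml_text: str) -> Attempt:   # stand-in for "ngraph inspect"
    return Attempt(success=True)

def run_scenario(yaml_text: str) -> Attempt:       # stand-in for "ngraph run"
    return Attempt(success="missing_node" not in yaml_text,
                   error="demand references unknown node")

def generate(feedback: str) -> str:                # stand-in for the LLM call
    return "nodes: {}" if feedback else "missing_node: true"

feedback = ""
for _ in range(3):
    yaml_text = generate(feedback)
    if not inspect_scenario(yaml_text).success:
        feedback = "inspect failed"
        continue
    run = run_scenario(yaml_text)
    if not run.success:                            # NEW: the scenario must also RUN
        feedback = f"Simulation failed: {run.error}"
        continue
    print("scenario inspects AND runs")
    break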

13 files changed

Lines changed: 487 additions & 144 deletions
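
Only two of the 13 files are excerpted on this page. The /v1 fix mentioned above lands in the OpenAI backend, whose hunk is not shown; as a rough illustration of that kind of guard (function and variable names here are hypothetical, not the actual backend.py code):

# Hypothetical sketch: normalize the base URL so /v1 is never doubled.
def chat_completions_url(base_url: str) -> str:
    base = base_url.rstrip("/")
    if base.endswith("/v1"):          # caller already included /v1 in the base URL
        base = base[: -len("/v1")]
    return f"{base}/v1/chat/completions"

assert chat_completions_url("https://api.openai.com") \
    == "https://api.openai.com/v1/chat/completions"
assert chat_completions_url("https://api.openai.com/v1") \
    == "https://api.openai.com/v1/chat/completions"   # no double /v1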

netlab/autoresearch/cli.py

Lines changed: 50 additions & 7 deletions
@@ -10,6 +10,7 @@
 
 import argparse
 import logging
+import os
 import shutil
 import sys
 import textwrap
@@ -18,6 +19,10 @@
 import yaml
 
 from netlab.autoresearch.backend import (
+    DEFAULT_CLAUDE_MODEL,
+    DEFAULT_CODEX_MODEL,
+    DEFAULT_OPENAI_MODEL,
+    SUPPORTED_BACKENDS,
     ClaudeCLIBackend,
     CodexCLIBackend,
     LLMBackend,
@@ -96,33 +101,70 @@ def _build_default_template(base_scenario_path: Path) -> str:
 def _build_backend(args: argparse.Namespace) -> LLMBackend:
     """Construct an LLM backend from CLI arguments."""
     backend_name: str = args.backend
+    backend_bin = getattr(args, "backend_bin", None)
 
     if backend_name == "mock":
         return _build_mock_backend(args)
     elif backend_name == "claude-cli":
-        return ClaudeCLIBackend()
+        model = _resolve_model_arg(
+            args,
+            generic_attr="model",
+            specific_attr="claude_model",
+            env_var="CLAUDE_MODEL",
+            default=DEFAULT_CLAUDE_MODEL,
+        )
+        return ClaudeCLIBackend(model=model, command=backend_bin)
     elif backend_name == "codex-cli":
-        return CodexCLIBackend()
+        model = _resolve_model_arg(
+            args,
+            generic_attr="model",
+            specific_attr="codex_model",
+            env_var="CODEX_MODEL",
+            default=DEFAULT_CODEX_MODEL,
+        )
+        return CodexCLIBackend(model=model, command=backend_bin)
     elif backend_name == "openai":
-        import os
-
         base_url = getattr(args, "openai_base_url", None) or os.environ.get(
             "OPENAI_BASE_URL", "https://api.openai.com"
         )
-        model = getattr(args, "openai_model", None) or os.environ.get(
-            "OPENAI_MODEL", "gpt-4"
+        model = _resolve_model_arg(
+            args,
+            generic_attr="model",
+            specific_attr="openai_model",
+            env_var="OPENAI_MODEL",
+            default=DEFAULT_OPENAI_MODEL,
         )
         api_key = os.environ.get("OPENAI_API_KEY", "")
         return OpenAICompatibleBackend(base_url=base_url, model=model, api_key=api_key)
     else:
         print(
             f"Unknown backend: {backend_name!r}. "
-            "Use 'mock', 'claude-cli', 'codex-cli', or 'openai'.",
+            f"Use {', '.join(repr(name) for name in SUPPORTED_BACKENDS)}.",
             file=sys.stderr,
         )
         sys.exit(1)
 
 
+def _resolve_model_arg(
+    args: argparse.Namespace,
+    *,
+    generic_attr: str,
+    specific_attr: str,
+    env_var: str,
+    default: str,
+) -> str:
+    """Resolve model from generic CLI flag, backend-specific flag, env, then default."""
+    generic_value = getattr(args, generic_attr, None)
+    if generic_value:
+        return generic_value
+
+    specific_value = getattr(args, specific_attr, None)
+    if specific_value:
+        return specific_value
+
+    return os.environ.get(env_var, default)
+
+
 def _build_mock_backend(args: argparse.Namespace) -> MockBackend:
     """Build a MockBackend that generates plausible YAML responses.
@@ -292,6 +334,7 @@ def autoresearch_run(args: argparse.Namespace) -> None:
     config = RunConfig(
         project_dir=project_dir,
         backend=backend,
+        ngraph_bin=getattr(args, "ngraph_bin", None),
         max_experiments=args.max_experiments,
         timeout_s=args.timeout,
         seed=args.seed,
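
To make the resolution order concrete, a small usage sketch (assumes _resolve_model_arg from the hunk above is in scope; the flag and model values are made up, and the real default is DEFAULT_CLAUDE_MODEL from backend.py):

import argparse
import os

os.environ["CLAUDE_MODEL"] = "model-from-env"
args = argparse.Namespace(model=None, claude_model="model-from-flag")

resolved = _resolve_model_arg(
    args,
    generic_attr="model",
    specific_attr="claude_model",
    env_var="CLAUDE_MODEL",
    default="model-default",   # placeholder for DEFAULT_CLAUDE_MODEL
)
# resolved == "model-from-flag": a generic --model value would win if set,
# then the backend-specific flag, then $CLAUDE_MODEL, then the default.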

netlab/autoresearch/generation_loop.py

Lines changed: 95 additions & 8 deletions
@@ -22,6 +22,8 @@
 
 import yaml
 
+from netlab.runtime import require_executable
+
 from .backend import LLMBackend
 
 
@@ -51,11 +53,17 @@ def summary(self) -> str:
 
 @dataclass
 class GenerationResult:
-    """Output of the generation loop."""
+    """Output of the generation loop.
+
+    On success, contains both the validated scenario and the simulation
+    results (since the simulation is run as part of validation).
+    """
 
     success: bool
     scenario_yaml: str = ""
     scenario_path: Path | None = None
+    results_path: Path | None = None
+    results_data: dict | None = None
     inspect: InspectResult | None = None
     iterations_used: int = 0
     error: str = ""
@@ -256,7 +264,7 @@ def _get_generation_system_prompt() -> str:
     """
 
 _REVISION_PROMPT_TEMPLATE = """\
-The scenario you generated was inspected by ngraph. Here is the result:
+The scenario you generated failed validation:
 
 {inspect_summary}
 
@@ -265,7 +273,13 @@ def _get_generation_system_prompt() -> str:
 
 {validation_errors}
 
-Please fix the scenario YAML to match the intent. Return ONLY the YAML content.
+Common issues:
+- Demand source/target regex must match existing node names
+- Failure rule mode must be "choice" (not "random") with count: N
+- All nodes referenced in links must be defined in the nodes section
+- WorkflowType is TrafficMatrixPlacement (not TrafficMatrixPerformance)
+
+Fix the scenario YAML. Return ONLY the YAML content.
 """
 
 
@@ -292,11 +306,10 @@ def run_generation_loop(
         GenerationResult with the validated scenario or error details.
     """
     if ngraph_bin is None:
-        ngraph_bin = shutil.which("ngraph")
-        if ngraph_bin is None:
-            return GenerationResult(
-                success=False, error="ngraph binary not found on PATH"
-            )
+        try:
+            ngraph_bin = require_executable("ngraph", env_var="NETLAB_NGRAPH_BIN")
+        except RuntimeError as exc:
+            return GenerationResult(success=False, error=str(exc))
 
     cleanup_work_dir = False
     if work_dir is None:
@@ -359,10 +372,24 @@
             last_inspect.errors = viability_errors
             continue
 
+        # Run simulation as definitive validation.
+        # For LLM-generated scenarios (10-20 nodes), this takes <1s.
+        # Catches issues inspect misses: unresolved demand patterns,
+        # invalid failure policies, workflow reference errors.
+        sim_result = _run_simulation(scenario_path, ngraph_bin, work_dir)
+        if not sim_result.success:
+            last_inspect = InspectResult(
+                success=False,
+                errors=[f"Simulation failed: {sim_result.error}"],
+            )
+            continue
+
         return GenerationResult(
             success=True,
            scenario_yaml=yaml_text,
             scenario_path=scenario_path,
+            results_path=sim_result.results_path,
+            results_data=sim_result.results_data,
             inspect=last_inspect,
             iterations_used=iteration + 1,
         )
@@ -381,6 +408,66 @@
         shutil.rmtree(work_dir, ignore_errors=True)
 
 
+@dataclass
+class _SimResult:
+    """Internal result from a trial simulation run."""
+
+    success: bool
+    results_path: Path | None = None
+    results_data: dict | None = None
+    error: str = ""
+
+
+def _run_simulation(scenario_path: Path, ngraph_bin: str, work_dir: Path) -> _SimResult:
+    """Run ngraph on the scenario as a validation step.
+
+    Returns the results if successful, or an error message if not.
+    Timeout is short (60s) since LLM-generated scenarios are small.
+    """
+    import json
+
+    results_dir = work_dir / "results"
+    results_dir.mkdir(exist_ok=True)
+    try:
+        proc = subprocess.run(
+            [ngraph_bin, "run", str(scenario_path), "-o", str(results_dir)],
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+    except subprocess.TimeoutExpired:
+        return _SimResult(success=False, error="Simulation timed out (60s)")
+
+    if proc.returncode != 0:
+        # Extract useful error from stderr
+        stderr = proc.stderr.strip()
+        error_lines = [
+            line
+            for line in stderr.splitlines()
+            if "error" in line.lower() or "Error" in line
+        ]
+        error_msg = "; ".join(error_lines[-3:]) if error_lines else stderr[-300:]
+        return _SimResult(success=False, error=error_msg)
+
+    # Find and load results
+    results_files = list(results_dir.glob("*.results.json"))
+    if not results_files:
+        return _SimResult(success=False, error="No results file produced")
+
+    results_path = results_files[0]
+    try:
+        with results_path.open() as f:
+            results_data = json.load(f)
+    except (json.JSONDecodeError, OSError) as e:
+        return _SimResult(success=False, error=f"Failed to load results: {e}")
+
+    return _SimResult(
+        success=True,
+        results_path=results_path,
+        results_data=results_data,
+    )
+
+
 def _check_viability(inspect: InspectResult) -> list[str]:
     """Check that an inspected scenario is minimally viable.
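Since GenerationResult now carries results_path and results_data, callers consume one object instead of re-running the simulation. A sketch of the consuming side (field names follow the diff above; the way run_generation_loop is invoked here, including the backend keyword, is an assumption, since only part of its signature appears in this excerpt):

from netlab.autoresearch.generation_loop import run_generation_loop

result = run_generation_loop(backend=backend)   # backend: an LLMBackend instance

if result.success:
    # Simulation already ran as the final validation step, so the outer
    # loop no longer needs a separate "ngraph run".
    print(f"validated in {result.iterations_used} iteration(s)")
    print(f"results file: {result.results_path}")
    metrics = result.results_data or {}         # parsed *.results.json contents
else:
    print(f"generation failed: {result.error}")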