Commit 5c3a1cd

Andrey Golovanov and claude committed
Run simulation inside generation loop: iterate until the scenario RUNS, not just passes inspect

The generation loop previously declared success when ngraph inspect passed, but simulation could still fail (unresolved demand patterns, invalid failure modes). Qwen3-235B produced a scenario whose demands referenced nodes that didn't exist: inspect passed, but ngraph run crashed.

Now the loop runs ngraph as the final validation step. If simulation fails, the error feeds back to the LLM for another attempt, and the loop iterates until the scenario both inspects AND runs successfully. As a result, GenerationResult now includes results_data, and the outer loop no longer runs simulation separately. For LLM-generated scenarios (10-20 nodes), simulation takes under 1s, negligible next to the LLM call.

Also fixed: the OpenAI backend doubled /v1 when constructing request URLs.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 45867ac commit 5c3a1cd
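
The change is easiest to see as a schematic of the revised loop. The sketch below is simplified and self-contained: inspect_scenario, run_scenario, and generate are stand-ins invented for illustration, not the real APIs; the actual implementation is in generation_loop.py further down.

# Simplified sketch of the new control flow: simulation, not inspect,
# is the final gate, and its error text becomes the next LLM feedback.
from dataclasses import dataclass

@dataclass
class Attempt:
    success: bool
    error: str = ""

def inspect_scenario(yaml_text: str) -> Attempt:   # stand-in for "ngraph inspect"
    return Attempt(success=True)

def run_scenario(yaml_text: str) -> Attempt:       # stand-in for "ngraph run"
    return Attempt(success="missing_node" not in yaml_text,
                   error="demand references unknown node")

def generate(feedback: str) -> str:                # stand-in for the LLM call
    return "nodes: {}" if feedback else "missing_node: true"

feedback = ""
for _ in range(3):
    yaml_text = generate(feedback)
    if not inspect_scenario(yaml_text).success:
        feedback = "inspect failed"
        continue
    run = run_scenario(yaml_text)
    if not run.success:                            # NEW: the scenario must also RUN
        feedback = f"Simulation failed: {run.error}"
        continue
    print("scenario inspects AND runs")
    break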

13 files changed

Lines changed: 487 additions & 144 deletions
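
Only two of the 13 files are excerpted on this page. The /v1 fix mentioned above lands in the OpenAI backend, whose hunk is not shown; as a rough illustration of that kind of guard (function and variable names here are hypothetical, not the actual backend.py code):

# Hypothetical sketch: normalize the base URL so /v1 is never doubled.
def chat_completions_url(base_url: str) -> str:
    base = base_url.rstrip("/")
    if base.endswith("/v1"):          # caller already included /v1 in the base URL
        base = base[: -len("/v1")]
    return f"{base}/v1/chat/completions"

assert chat_completions_url("https://api.openai.com") \
    == "https://api.openai.com/v1/chat/completions"
assert chat_completions_url("https://api.openai.com/v1") \
    == "https://api.openai.com/v1/chat/completions"   # no double /v1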

netlab/autoresearch/cli.py

Lines changed: 50 additions & 7 deletions
@@ -10,6 +10,7 @@
 
 import argparse
 import logging
+import os
 import shutil
 import sys
 import textwrap
@@ -18,6 +19,10 @@
 import yaml
 
 from netlab.autoresearch.backend import (
+    DEFAULT_CLAUDE_MODEL,
+    DEFAULT_CODEX_MODEL,
+    DEFAULT_OPENAI_MODEL,
+    SUPPORTED_BACKENDS,
     ClaudeCLIBackend,
     CodexCLIBackend,
     LLMBackend,
@@ -96,33 +101,70 @@ def _build_default_template(base_scenario_path: Path) -> str:
 def _build_backend(args: argparse.Namespace) -> LLMBackend:
     """Construct an LLM backend from CLI arguments."""
     backend_name: str = args.backend
+    backend_bin = getattr(args, "backend_bin", None)
 
     if backend_name == "mock":
         return _build_mock_backend(args)
     elif backend_name == "claude-cli":
-        return ClaudeCLIBackend()
+        model = _resolve_model_arg(
+            args,
+            generic_attr="model",
+            specific_attr="claude_model",
+            env_var="CLAUDE_MODEL",
+            default=DEFAULT_CLAUDE_MODEL,
+        )
+        return ClaudeCLIBackend(model=model, command=backend_bin)
     elif backend_name == "codex-cli":
-        return CodexCLIBackend()
+        model = _resolve_model_arg(
+            args,
+            generic_attr="model",
+            specific_attr="codex_model",
+            env_var="CODEX_MODEL",
+            default=DEFAULT_CODEX_MODEL,
+        )
+        return CodexCLIBackend(model=model, command=backend_bin)
     elif backend_name == "openai":
-        import os
-
         base_url = getattr(args, "openai_base_url", None) or os.environ.get(
             "OPENAI_BASE_URL", "https://api.openai.com"
         )
-        model = getattr(args, "openai_model", None) or os.environ.get(
-            "OPENAI_MODEL", "gpt-4"
+        model = _resolve_model_arg(
+            args,
+            generic_attr="model",
+            specific_attr="openai_model",
+            env_var="OPENAI_MODEL",
+            default=DEFAULT_OPENAI_MODEL,
         )
         api_key = os.environ.get("OPENAI_API_KEY", "")
         return OpenAICompatibleBackend(base_url=base_url, model=model, api_key=api_key)
     else:
         print(
             f"Unknown backend: {backend_name!r}. "
-            "Use 'mock', 'claude-cli', 'codex-cli', or 'openai'.",
+            f"Use {', '.join(repr(name) for name in SUPPORTED_BACKENDS)}.",
             file=sys.stderr,
         )
         sys.exit(1)
 
 
+def _resolve_model_arg(
+    args: argparse.Namespace,
+    *,
+    generic_attr: str,
+    specific_attr: str,
+    env_var: str,
+    default: str,
+) -> str:
+    """Resolve model from generic CLI flag, backend-specific flag, env, then default."""
+    generic_value = getattr(args, generic_attr, None)
+    if generic_value:
+        return generic_value
+
+    specific_value = getattr(args, specific_attr, None)
+    if specific_value:
+        return specific_value
+
+    return os.environ.get(env_var, default)
+
+
 def _build_mock_backend(args: argparse.Namespace) -> MockBackend:
     """Build a MockBackend that generates plausible YAML responses.
@@ -292,6 +334,7 @@ def autoresearch_run(args: argparse.Namespace) -> None:
     config = RunConfig(
         project_dir=project_dir,
         backend=backend,
+        ngraph_bin=getattr(args, "ngraph_bin", None),
         max_experiments=args.max_experiments,
         timeout_s=args.timeout,
         seed=args.seed,
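
To make the resolution order concrete, a small usage sketch (assumes _resolve_model_arg from the hunk above is in scope; the flag and model values are made up, and the real default is DEFAULT_CLAUDE_MODEL from backend.py):

import argparse
import os

os.environ["CLAUDE_MODEL"] = "model-from-env"
args = argparse.Namespace(model=None, claude_model="model-from-flag")

resolved = _resolve_model_arg(
    args,
    generic_attr="model",
    specific_attr="claude_model",
    env_var="CLAUDE_MODEL",
    default="model-default",   # placeholder for DEFAULT_CLAUDE_MODEL
)
# resolved == "model-from-flag": a generic --model value would win if set,
# then the backend-specific flag, then $CLAUDE_MODEL, then the default.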

netlab/autoresearch/generation_loop.py

Lines changed: 95 additions & 8 deletions
@@ -22,6 +22,8 @@
 
 import yaml
 
+from netlab.runtime import require_executable
+
 from .backend import LLMBackend
 
 
@@ -51,11 +53,17 @@ def summary(self) -> str:
 
 @dataclass
 class GenerationResult:
-    """Output of the generation loop."""
+    """Output of the generation loop.
+
+    On success, contains both the validated scenario and the simulation
+    results (since the simulation is run as part of validation).
+    """
 
     success: bool
     scenario_yaml: str = ""
     scenario_path: Path | None = None
+    results_path: Path | None = None
+    results_data: dict | None = None
     inspect: InspectResult | None = None
     iterations_used: int = 0
     error: str = ""
@@ -256,7 +264,7 @@ def _get_generation_system_prompt() -> str:
     """
 
 _REVISION_PROMPT_TEMPLATE = """\
-The scenario you generated was inspected by ngraph. Here is the result:
+The scenario you generated failed validation:
 
 {inspect_summary}
 
@@ -265,7 +273,13 @@ def _get_generation_system_prompt() -> str:
 
 {validation_errors}
 
-Please fix the scenario YAML to match the intent. Return ONLY the YAML content.
+Common issues:
+- Demand source/target regex must match existing node names
+- Failure rule mode must be "choice" (not "random") with count: N
+- All nodes referenced in links must be defined in the nodes section
+- WorkflowType is TrafficMatrixPlacement (not TrafficMatrixPerformance)
+
+Fix the scenario YAML. Return ONLY the YAML content.
 """
 
 
@@ -292,11 +306,10 @@ def run_generation_loop(
         GenerationResult with the validated scenario or error details.
     """
     if ngraph_bin is None:
-        ngraph_bin = shutil.which("ngraph")
-        if ngraph_bin is None:
-            return GenerationResult(
-                success=False, error="ngraph binary not found on PATH"
-            )
+        try:
+            ngraph_bin = require_executable("ngraph", env_var="NETLAB_NGRAPH_BIN")
+        except RuntimeError as exc:
+            return GenerationResult(success=False, error=str(exc))
 
     cleanup_work_dir = False
     if work_dir is None:
@@ -359,10 +372,24 @@
             last_inspect.errors = viability_errors
             continue
 
+        # Run simulation as definitive validation.
+        # For LLM-generated scenarios (10-20 nodes), this takes <1s.
+        # Catches issues inspect misses: unresolved demand patterns,
+        # invalid failure policies, workflow reference errors.
+        sim_result = _run_simulation(scenario_path, ngraph_bin, work_dir)
+        if not sim_result.success:
+            last_inspect = InspectResult(
+                success=False,
+                errors=[f"Simulation failed: {sim_result.error}"],
+            )
+            continue
+
         return GenerationResult(
             success=True,
            scenario_yaml=yaml_text,
             scenario_path=scenario_path,
+            results_path=sim_result.results_path,
+            results_data=sim_result.results_data,
             inspect=last_inspect,
             iterations_used=iteration + 1,
         )
@@ -381,6 +408,66 @@
         shutil.rmtree(work_dir, ignore_errors=True)
 
 
+@dataclass
+class _SimResult:
+    """Internal result from a trial simulation run."""
+
+    success: bool
+    results_path: Path | None = None
+    results_data: dict | None = None
+    error: str = ""
+
+
+def _run_simulation(scenario_path: Path, ngraph_bin: str, work_dir: Path) -> _SimResult:
+    """Run ngraph on the scenario as a validation step.
+
+    Returns the results if successful, or an error message if not.
+    Timeout is short (60s) since LLM-generated scenarios are small.
+    """
+    import json
+
+    results_dir = work_dir / "results"
+    results_dir.mkdir(exist_ok=True)
+    try:
+        proc = subprocess.run(
+            [ngraph_bin, "run", str(scenario_path), "-o", str(results_dir)],
+            capture_output=True,
+            text=True,
+            timeout=60,
+        )
+    except subprocess.TimeoutExpired:
+        return _SimResult(success=False, error="Simulation timed out (60s)")
+
+    if proc.returncode != 0:
+        # Extract useful error from stderr
+        stderr = proc.stderr.strip()
+        error_lines = [
+            line
+            for line in stderr.splitlines()
+            if "error" in line.lower() or "Error" in line
+        ]
+        error_msg = "; ".join(error_lines[-3:]) if error_lines else stderr[-300:]
+        return _SimResult(success=False, error=error_msg)
+
+    # Find and load results
+    results_files = list(results_dir.glob("*.results.json"))
+    if not results_files:
+        return _SimResult(success=False, error="No results file produced")
+
+    results_path = results_files[0]
+    try:
+        with results_path.open() as f:
+            results_data = json.load(f)
+    except (json.JSONDecodeError, OSError) as e:
+        return _SimResult(success=False, error=f"Failed to load results: {e}")
+
+    return _SimResult(
+        success=True,
+        results_path=results_path,
+        results_data=results_data,
+    )
+
+
 def _check_viability(inspect: InspectResult) -> list[str]:
     """Check that an inspected scenario is minimally viable.
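Since GenerationResult now carries results_path and results_data, callers consume one object instead of re-running the simulation. A sketch of the consuming side (field names follow the diff above; the way run_generation_loop is invoked here, including the backend keyword, is an assumption, since only part of its signature appears in this excerpt):

from netlab.autoresearch.generation_loop import run_generation_loop

result = run_generation_loop(backend=backend)   # backend: an LLMBackend instance

if result.success:
    # Simulation already ran as the final validation step, so the outer
    # loop no longer needs a separate "ngraph run".
    print(f"validated in {result.iterations_used} iteration(s)")
    print(f"results file: {result.results_path}")
    metrics = result.results_data or {}         # parsed *.results.json contents
else:
    print(f"generation failed: {result.error}")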