@@ -145,7 +145,7 @@ def _run_platform_once(task: str, args: object) -> tuple[int, dict, float]:
     api = _platform_api_builder()
     if not api.enabled():
         print(
-            "Platform routing is not configured. Run `devsper platform connect` first.",
+            "Cloud routing is not configured. Run `devsper platform connect` first.",
             file=sys.stderr,
         )
         return 2, {}, 0.0
@@ -2721,6 +2721,164 @@ def _run_version(args: object) -> int:
     return 0


+def _run_eval(args: object) -> int:
+    """Eval harness: run dataset, score results, optionally optimize prompts."""
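+    # Datasets are JSONL; judging from the fields used below, each case carries
+    # at least an "id" and a "task" (EvalDataset defines the authoritative schema).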
+    import asyncio
+    import json
+    from pathlib import Path
+
+    eval_cmd = getattr(args, "eval_cmd", None)
+
+    if eval_cmd == "stub" or (eval_cmd is None and not hasattr(args, "dataset")):
+        # Generate stub dataset
+        from devsper.evals.dataset import EvalDataset
+
+        role = getattr(args, "role", "general")
+        n = getattr(args, "n", 5)
+        out = getattr(args, "out", None)
+        dataset = EvalDataset.stub(role=role, n=n)
+        if out:
+            dataset.save(out)
+            print(f"Stub dataset ({len(dataset)} cases) written to {out}")
+        else:
+            for case in dataset:
+                print(json.dumps(case.to_dict()))
+        return 0
+
+    if eval_cmd == "results":
+        from devsper.config import get_config
+
+        try:
+            results_dir = Path(getattr(args, "dir", None) or get_config().evals.results_dir)
+        except Exception:
+            results_dir = Path(".devsper/eval_results")
+        if not results_dir.exists():
+            print(f"No results found in {results_dir}")
+            return 0
+        files = sorted(results_dir.glob("*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
+        if not files:
+            print("No eval result files found.")
+            return 0
+        for f in files[:20]:
+            try:
+                data = json.loads(f.read_text())
+                print(
+                    f" {f.name} role={data.get('role', '?')} "
+                    f"pass_rate={data.get('pass_rate', '?')} "
+                    f"mean_score={data.get('mean_score', '?')}"
+                )
+            except Exception:
+                print(f" {f.name}")
+        return 0
+
+    # eval_cmd == "run"
+    from devsper.evals.dataset import EvalDataset
+    from devsper.evals.metrics import get_metric
+    from devsper.evals.runner import EvalRunner
+    from devsper.config import get_config
+
+    try:
+        cfg = get_config()
+    except Exception:
+        from devsper.config.schema import devsperConfigModel
+        cfg = devsperConfigModel()
+
+    dataset_path = getattr(args, "dataset", None)
+    if not dataset_path:
+        print("Error: --dataset is required for 'eval run'")
+        return 1
+
+    dataset = EvalDataset.load(dataset_path)
+    role = getattr(args, "role", None)
+    metric_name = getattr(args, "metric", None) or cfg.evals.default_metric
+    threshold = getattr(args, "threshold", None) or cfg.evals.pass_threshold
+    concurrency = getattr(args, "concurrency", None) or cfg.evals.concurrency
+    do_optimize = getattr(args, "optimize", False)
+    optimizer_override = getattr(args, "optimizer", None)
+    out_path = getattr(args, "out", None)
+
+    metric = get_metric(metric_name)
+
+    # Build optimizer if requested
+    optimizer = None
+    if do_optimize:
+        from devsper.prompt_optimizer.factory import get_prompt_optimizer, reset_prompt_optimizer
+
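+        # The override is handed to the factory via DEVSPER_PROMPT_OPTIMIZER;
+        # resetting the cached optimizer forces a rebuild with the new backend
+        # (assumes the factory reads that environment variable).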
+        if optimizer_override:
+            import os
+            os.environ["DEVSPER_PROMPT_OPTIMIZER"] = optimizer_override
+            reset_prompt_optimizer()
+        optimizer = get_prompt_optimizer(cfg)
+
+    # Build a minimal agent for evaluation
+    from devsper.agents.agent import Agent
+
+    agent = Agent(model_name=cfg.models.worker, use_tools=False)
+
+    runner = EvalRunner(
+        agent=agent,
+        metric=metric,
+        pass_threshold=threshold,
+        concurrency=concurrency,
+        optimize_after=do_optimize,
+        optimizer=optimizer,
+    )
+
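+    # asyncio.run() raises RuntimeError when an event loop is already running,
+    # so fall back to driving the coroutine on a freshly created loop.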
+    try:
+        summary = asyncio.run(runner.run_async(dataset, role=role))
+    except RuntimeError:
+        loop = asyncio.new_event_loop()
+        summary = loop.run_until_complete(runner.run_async(dataset, role=role))
+
+    # Print summary
+    try:
+        from rich.console import Console
+        from rich.table import Table
+
+        console = Console()
+        console.print(
+            f"\n[bold]Eval Results[/bold] role=[cyan]{summary.role}[/cyan] "
+            f"metric=[cyan]{summary.metric_name}[/cyan] "
+            f"optimizer=[cyan]{summary.optimizer_backend}[/cyan]"
+        )
+        console.print(
+            f" Passed: [green]{summary.passed}[/green]/{summary.total} "
+            f"Pass rate: [bold]{summary.pass_rate:.1%}[/bold] "
+            f"Mean score: [bold]{summary.mean_score:.3f}[/bold]\n"
+        )
+        table = Table(show_header=True, header_style="bold")
+        table.add_column("ID", style="dim")
+        table.add_column("Task", max_width=40)
+        table.add_column("Score")
+        table.add_column("Pass")
+        for r in summary.results:
+            color = "green" if r.passed else "red"
+            table.add_row(
+                r.case.id,
+                r.case.task[:40],
+                f"{r.score:.2f}",
+                f"[{color}]{'✓' if r.passed else '✗'}[/{color}]",
+            )
+        console.print(table)
+    except ImportError:
+        print(f"\nEval: role={summary.role} metric={summary.metric_name}")
+        print(f" {summary.passed}/{summary.total} passed ({summary.pass_rate:.1%})")
+        print(f" Mean score: {summary.mean_score:.3f}")
+
+    # Persist results
+    results_dir = Path(cfg.evals.results_dir)
+    results_dir.mkdir(parents=True, exist_ok=True)
+    ts = __import__("datetime").datetime.now().strftime("%Y%m%d_%H%M%S")
+    result_file = results_dir / f"eval_{summary.role}_{ts}.json"
+    result_file.write_text(summary.to_json())
+    print(f"\nResults saved to {result_file}")
+
+    if out_path:
+        Path(out_path).write_text(summary.to_json())
+
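+    # Non-zero exit when the pass rate misses the threshold, so callers (e.g. CI) can gate on it.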
+    return 0 if summary.pass_rate >= threshold else 1
+
+
 def _run_health(args: object) -> int:
     """Run health checks. Exit 0 if healthy, 1 otherwise. Print ✓/✗ per check."""
     import asyncio
@@ -2811,7 +2969,7 @@ def _run_upgrade(args: object) -> int:


 def _run_cloud_dispatch(args: object) -> int:
-    """Devsper Platform (cloud): login, run, status, logs."""
+    """Devsper Cloud: login, run, status, logs."""
     cmd = getattr(args, "cloud_cmd", None)
     if not cmd:
         return 0
@@ -4082,6 +4240,64 @@ def main() -> int:
     )
     observe_parser.set_defaults(func=lambda a: _run_observe(a.port, a.db))

+    eval_parser = subparsers.add_parser(
+        "eval",
+        help="Eval harness and prompt optimization",
+        description="Run evals against a JSONL dataset and optionally optimize prompts.",
+        epilog="""
+Examples:
+  devsper eval run --dataset evals.jsonl --metric contains
+  devsper eval run --dataset evals.jsonl --role research --optimize --optimizer dspy
+  devsper eval stub --role research --out evals.jsonl
+  devsper eval results
+""",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    eval_sub = eval_parser.add_subparsers(dest="eval_cmd", help="Subcommand")
+
+    eval_run_p = eval_sub.add_parser("run", help="Run eval dataset")
+    eval_run_p.add_argument("--dataset", required=True, help="Path to JSONL dataset")
+    eval_run_p.add_argument("--role", default=None, help="Filter to this agent role")
+    eval_run_p.add_argument(
+        "--metric",
+        default=None,
+        help="Metric name: exact_match | contains | regex_match | word_overlap | llm_judge (default: config)",
+    )
+    eval_run_p.add_argument(
+        "--threshold", type=float, default=None, help="Pass threshold (default: config)"
+    )
+    eval_run_p.add_argument(
+        "--optimize",
+        action="store_true",
+        help="Run prompt optimization after eval using the configured optimizer",
+    )
+    eval_run_p.add_argument(
+        "--optimizer",
+        default=None,
+        help="Override optimizer backend: noop | dspy | gepa",
+    )
+    eval_run_p.add_argument(
+        "--concurrency", type=int, default=None, help="Parallel eval cases"
+    )
+    eval_run_p.add_argument("--out", default=None, help="Save JSON results to this path")
+    eval_run_p.set_defaults(eval_cmd="run")
+
+    eval_stub_p = eval_sub.add_parser("stub", help="Generate a stub dataset")
+    eval_stub_p.add_argument(
+        "--role", default="general", help="Agent role (research/code/analysis/general)"
+    )
+    eval_stub_p.add_argument("-n", type=int, default=5, help="Number of examples")
+    eval_stub_p.add_argument(
+        "--out", default=None, help="Output JSONL path (default: prints to stdout)"
+    )
+    eval_stub_p.set_defaults(eval_cmd="stub")
+
+    eval_results_p = eval_sub.add_parser("results", help="List recent eval result files")
+    eval_results_p.add_argument("--dir", default=None, help="Results directory")
+    eval_results_p.set_defaults(eval_cmd="results")
+
+    eval_parser.set_defaults(func=_run_eval)
+
     health_parser = subparsers.add_parser(
         "health",
         help="Health and readiness check",