Commit e1726ad

feat: add support for the original mt-bench (#21)
* Add llamacpp dependency and update gitignore with generated directories
* Add documentation for llamacpp in README
* Document direnv usage for environment variable management
* Narrow down transformers dependency to fix version mismatch
* Add max_model_len param for VLLM to prevent OOM errors
* Fix completion loading and EuroLLM-9B example
  - Updated README to use EuroLLM-Instruct because the base (EuroLLM-9B) doesn't have a chat template and throws an error.
  - Added functionality to load pre-existing dataset completions for models. Previously this threw an error because the model was treated as a provider.
* Remove `direnv` documentation
* Revert stylistic (formatting) changes and add more documentation for the new `max_model_len` and related parameters
* Rename OPENJURY_EVAL_DATA to OPENJURY_DATA
* Revert changes in gitignore
* Handle models with max_position_embeddings when we pass max_model_len
  - Moved max_model_len and chat_template to **model_kwargs for readability.
  - Adjusted ChatVLLM initialization to cap max_model_len based on the model's max_position_embeddings.
  - Added warnings for potential max_model_len issues.
* Revert EuroLLM-9B-Instruct to EuroLLM-9B since there is a default chat template now
* Fix tests
  - Mock external API calls
  - Add safety check for content in completions
* Change test GitHub workflow to use uv instead of pip for more robust dependency resolution
* Move dev dependencies to a dependency group
* Revert comment removal
* Add pre-commit hook
* Add project scripts and move slurmpilot to dev group
  - Moved slurmpilot to the dev group since it doesn't have a published version on PyPI, which would otherwise block publishing OpenJury on PyPI.
* Fix LlamaCpp bug with ChatTemplate (see the sketch after this list)
  - There was a halting issue with LlamaCpp: the model was not emitting the EOS token and Llama.reset() is not called between calls (turns), causing a KV cache position mismatch crash, so ChatLlamaCppModel was created as a custom wrapper to fix this.
  - BaseLocalModel was extracted as common logic for ChatLlamaCppModel and ChatVLLM.
* Add MT-Bench multi-turn evaluation support
  - Implement MT-Bench loader and multi-turn generation/judging logic.
  - Add paper-aligned prompt templates while keeping the score-based evaluation consistent with OpenJury.
  - Support reference answers, per-turn breakdowns, and swap mode.
  - Add comprehensive MT-Bench pipeline tests.
* Fix result formatting
* Remove double environment variable
* Remove accidental duplications
* Refactor
  - Implement a new function to download MT-Bench questions and GPT-4 reference answers, with fallback mechanisms for missing references.
  - Remove duplication.
* Remove duplication between prompt templates
* Add temperature argument
* Add option for making MT-Bench consistent with the original one from FastChat
* Remove redundant print statement
* Move MT-Bench logic out of the entrypoint
* Remove stale unused entries for fastchat mode
* Refactor MT-Bench eval helpers into shared runtime module
* Move CLI args and parsing to a separate util to remove dependencies on the entrypoint
* Refactor to address comments on PR
* Remove OpenJury mode for MT-Bench, keeping only the original version
* Restore code and fix after merge/refactor
* Format
* Fix CI
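
The LlamaCpp fix above boils down to clearing the KV cache between turns. A minimal sketch of the idea, assuming llama-cpp-python's `Llama.reset()` and `create_chat_completion()`; the wrapper and its `invoke` method are illustrative stand-ins, not the repository's actual `ChatLlamaCppModel`:

from llama_cpp import Llama  # llama-cpp-python

class ChatLlamaCppModel:
    # Illustrative wrapper; the real class in this commit also shares
    # common logic with ChatVLLM through BaseLocalModel.
    def __init__(self, model_path: str, **engine_kwargs):
        self._llm = Llama(model_path=model_path, **engine_kwargs)

    def invoke(self, messages: list[dict]) -> str:
        # Reset KV-cache positions before each call so consecutive turns
        # start from position 0 instead of crashing on a position mismatch.
        self._llm.reset()
        out = self._llm.create_chat_completion(messages=messages)
        return out["choices"][0]["message"]["content"]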
1 parent 223db1b commit e1726ad

20 files changed

Lines changed: 1630 additions & 13 deletions

judgearena/config.py

Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
"""CLI argument configuration for generation and evaluation entrypoints."""

import argparse
import json
from dataclasses import dataclass, field


@dataclass
class CliArgs:
    dataset: str
    model_A: str
    model_B: str
    judge_model: str

    n_instructions: int | None = None
    provide_explanation: bool = False
    swap_mode: str = "fixed"
    ignore_cache: bool = False
    use_tqdm: bool = False
    truncate_all_input_chars: int = 8192
    max_out_tokens_models: int = 32768
    max_out_tokens_judge: int = 32768
    max_model_len: int | None = None
    chat_template: str | None = None
    result_folder: str = "results"
    engine_kwargs: dict = field(default_factory=dict)

    def __post_init__(self):
        supported_modes = ["fixed", "both"]
        assert self.swap_mode in supported_modes, (
            f"Only {supported_modes} modes are supported but got {self.swap_mode}."
        )

    @classmethod
    def parse_args(cls):
        parser = argparse.ArgumentParser(
            prog="Generate completion and evaluate with a judge",
        )
        parser.add_argument(
            "--dataset",
            help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction "
            "tuning cases or `french-contexts`, `spanish-contexts` for base models.",
        )
        parser.add_argument(
            "--model_A",
            required=True,
            help="Name of the LLM to use for generation; must be a valid choice for `generation_provider`",
        )
        parser.add_argument(
            "--model_B",
            required=True,
            help="Name of the LLM to use for generation; must be a valid choice for `generation_provider`",
        )
        parser.add_argument(
            "--judge_model",
            required=True,
            help="Name of the LLM to use, for instance `Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, "
            "`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, `LangChain/LocalPath` etc",
        )
        parser.add_argument(
            "--n_instructions",
            type=int,
            required=False,
        )
        parser.add_argument(
            "--provide_explanation",
            action="store_true",
            help="If specified, the judge will provide an explanation before making a judgement. Does not "
            "necessarily improve the accuracy of the judge but enables some result interpretation.",
        )
        parser.add_argument(
            "--swap_mode",
            type=str,
            choices=["fixed", "both"],
            default="fixed",
            help="Model comparison order mode. 'fixed': always use model order A-B. 'both': correct for model order "
            "bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account "
            "for judge position bias. Default is 'fixed'.",
        )
        parser.add_argument(
            "--ignore_cache",
            action="store_true",
            help="If specified, ignore cache of previous completions.",
        )
        parser.add_argument(
            "--use_tqdm",
            action="store_true",
            help="If specified, use tqdm; does not work with all model providers, vLLM in particular.",
        )
        parser.add_argument(
            "--result_folder",
            type=str,
            required=False,
            default="results",
            help="The folder to save the results. Defaults to `results`. Evaluation results will be saved in"
            " `[result_folder]/[evaluation_name]`.",
        )
        parser.add_argument(
            "--truncate_all_input_chars",
            type=int,
            required=False,
            default=8192,
            help="Character-level truncation applied before tokenization: truncates each instruction "
            "before model A/B generation and truncates each completion before judge evaluation.",
        )
        parser.add_argument(
            "--max_out_tokens_models",
            type=int,
            required=False,
            default=32768,
            help=(
                "Generation token budget for each model A/B response. For VLLM, keep this <= "
                "--max_model_len (if provided)."
            ),
        )
        parser.add_argument(
            "--max_out_tokens_judge",
            type=int,
            required=False,
            default=32768,
            help=(
                "Generation token budget for the judge response (reasoning + scores). For "
                "VLLM, keep this <= --max_model_len (if provided)."
            ),
        )
        parser.add_argument(
            "--max_model_len",
            type=int,
            required=False,
            default=None,
            help=(
                "Optional total context window for VLLM models (prompt + generation). This is "
                "independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap "
                "generated tokens. This is useful on smaller GPUs to avoid OOM."
            ),
        )
        parser.add_argument(
            "--chat_template",
            type=str,
            required=False,
            default=None,
            help="Jinja2 chat template string to use instead of the model's tokenizer template. "
            "If not provided, ChatML is used as a fallback for models without a chat template.",
        )
        parser.add_argument(
            "--engine_kwargs",
            type=str,
            required=False,
            default="{}",
            help=(
                "JSON dict of engine-specific kwargs forwarded to the underlying engine. "
                'Example for vLLM: \'{"tensor_parallel_size": 2, "gpu_memory_utilization": 0.9}\'.'
            ),
        )
        args = parser.parse_args()

        try:
            engine_kwargs = json.loads(args.engine_kwargs) if args.engine_kwargs else {}
            if not isinstance(engine_kwargs, dict):
                raise ValueError("engine_kwargs must be a JSON object")
        except Exception as e:
            raise SystemExit(f"Failed to parse --engine_kwargs: {e}") from e

        return cls(
            dataset=args.dataset,
            model_A=args.model_A,
            model_B=args.model_B,
            judge_model=args.judge_model,
            n_instructions=args.n_instructions,
            provide_explanation=args.provide_explanation,
            swap_mode=args.swap_mode,
            ignore_cache=args.ignore_cache,
            use_tqdm=args.use_tqdm,
            truncate_all_input_chars=args.truncate_all_input_chars,
            max_out_tokens_models=args.max_out_tokens_models,
            max_out_tokens_judge=args.max_out_tokens_judge,
            max_model_len=args.max_model_len,
            chat_template=args.chat_template,
            result_folder=args.result_folder,
            engine_kwargs=engine_kwargs,
        )

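A hedged usage sketch of the new CLI surface; the invocation values below are illustrative, and only the flags themselves come from the parser above:

import sys

from judgearena.config import CliArgs

# Hypothetical command line; dataset and model names are examples only.
sys.argv = [
    "judgearena",
    "--dataset", "alpaca-eval",
    "--model_A", "VLLM/meta-llama/Meta-Llama-3-8B-Instruct",
    "--model_B", "VLLM/mistralai/Mistral-7B-Instruct-v0.3",
    "--judge_model", "Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
    "--swap_mode", "both",
    "--max_model_len", "8192",
    "--engine_kwargs", '{"tensor_parallel_size": 2, "gpu_memory_utilization": 0.9}',
]

args = CliArgs.parse_args()
assert args.engine_kwargs == {"tensor_parallel_size": 2, "gpu_memory_utilization": 0.9}
assert args.swap_mode == "both"  # __post_init__ would reject anything else

Note that --engine_kwargs is parsed to a plain dict exactly once here, so downstream code never has to re-parse JSON.
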
judgearena/eval_utils.py

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
"""Shared evaluation runtime helpers used by entrypoints and benchmark pipelines."""

from __future__ import annotations

from dataclasses import dataclass

import pandas as pd

from judgearena.evaluate import PairScore, annotate_battles
from judgearena.utils import compute_pref_summary


def print_results(results):
    """Print battle results in a readable format."""
    print("\n" + "=" * 60)
    print("🏆 MODEL BATTLE RESULTS 🏆".center(60))
    print(f"📊 Dataset: {results['dataset']}")
    print(
        f"🤖 Competitors: Model A: {results['model_A']} vs Model B: {results['model_B']}"
    )
    print(f"⚖️ Judge: {results['judge_model']}")
    print("📈 Results Summary:")
    print(f" Total Battles: {results['num_battles']}")
    print(f" Win Rate (A): {results['winrate']:.1%}")
    print(f" ✅ Wins: {results['num_wins']}")
    print(f" ❌ Losses: {results['num_losses']}")
    print(f" 🤝 Ties: {results['num_ties']}")
    if results.get("num_missing", 0) > 0:
        print(f" ❓ Missing: {results['num_missing']}")

    per_category = results.get("per_category")
    if per_category:
        print("\nPer-Category Breakdown:")
        print(
            f" {'Category':<14} | {'Win Rate(A)':>11} | {'Wins':>4} | {'Losses':>6} | {'Ties':>4}"
        )
        print(f" {'-' * 14}-+-{'-' * 11}-+-{'-' * 4}-+-{'-' * 6}-+-{'-' * 4}")
        for cat, stats in sorted(per_category.items()):
            print(
                f" {cat:<14} | {stats['winrate']:>11.1%} | "
                f"{stats['num_wins']:>4} | {stats['num_losses']:>6} | {stats['num_ties']:>4}"
            )

    per_turn = results.get("per_turn")
    if per_turn:
        print("\nPer-Turn Breakdown:")
        for turn, stats in sorted(per_turn.items()):
            print(
                f" Turn {turn} Win Rate(A): {stats['winrate']:.1%} "
                f"(W:{stats['num_wins']} L:{stats['num_losses']} T:{stats['num_ties']})"
            )
    print("=" * 60 + "\n")


def _compute_grouped_stats(
    preferences: pd.Series,
    metadata: list[dict[str, object]],
    group_by: str,
) -> dict[object, dict[str, float | int]]:
    grouped: dict[object, list[float]] = {}
    for meta, pref in zip(metadata, preferences, strict=True):
        key = meta.get(group_by)
        if key is None:
            continue
        grouped.setdefault(key, []).append(pref)
    return {key: compute_pref_summary(pd.Series(vals)) for key, vals in grouped.items()}


def _parse_preferences_from_annotations(
    annotations: list,
    score_parser: PairScore,
) -> pd.Series:
    return pd.Series(
        [
            score_parser.parse_model_raw(annotation.judge_completion)
            for annotation in annotations
        ]
    )


@dataclass
class JudgeAnnotationResult:
    annotations: list
    annotations_reversed: list
    metadata_for_annotations: list[dict[str, object]]
    metadata_for_reversed_annotations: list[dict[str, object]]
    preferences: pd.Series
    combined_metadata: list[dict[str, object]]


def _make_judge_annotation(
    *,
    judge_chat_model,
    instructions: list[str],
    completions_A: list[str],
    completions_B: list[str],
    metadata: list[dict[str, object]],
    score_parser: PairScore,
    provide_explanation: bool,
    swap_mode: str,
    truncate_input_chars: int | None,
    use_tqdm: bool,
    system_prompt: str | None = None,
    user_prompt_template: str | None = None,
) -> JudgeAnnotationResult:
    if not instructions:
        raise ValueError("instructions must be non-empty")

    annotations = annotate_battles(
        judge_chat_model=judge_chat_model,
        instructions=instructions,
        completions_A=completions_A,
        completions_B=completions_B,
        provide_explanation=provide_explanation,
        system_prompt=system_prompt,
        user_prompt_template=user_prompt_template,
        truncate_input_chars=truncate_input_chars,
        use_tqdm=use_tqdm,
    )
    preference_parts = [_parse_preferences_from_annotations(annotations, score_parser)]

    annotations_reversed: list = []
    metadata_for_reversed_annotations: list[dict[str, object]] = []
    combined_metadata = list(metadata)

    if swap_mode == "both":
        print("Correction for judge bias towards a certain model position is set.")
        print("Evaluating completions with models reversed.")
        annotations_reversed = annotate_battles(
            judge_chat_model=judge_chat_model,
            instructions=instructions,
            completions_A=completions_B,
            completions_B=completions_A,
            provide_explanation=provide_explanation,
            system_prompt=system_prompt,
            user_prompt_template=user_prompt_template,
            truncate_input_chars=truncate_input_chars,
            use_tqdm=use_tqdm,
        )
        prefs_reversed = _parse_preferences_from_annotations(
            annotations_reversed, score_parser
        )
        preference_parts.append(1 - prefs_reversed)
        metadata_for_reversed_annotations = list(metadata)
        combined_metadata.extend(metadata)

    preferences = pd.concat(preference_parts).reset_index(drop=True)
    return JudgeAnnotationResult(
        annotations=annotations,
        annotations_reversed=annotations_reversed,
        metadata_for_annotations=list(metadata),
        metadata_for_reversed_annotations=metadata_for_reversed_annotations,
        preferences=preferences,
        combined_metadata=combined_metadata,
    )

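A toy sketch of the swap_mode="both" correction implemented by `_make_judge_annotation`, assuming parsed preferences are P(model A preferred) in [0, 1] (the values below are made up):

import pandas as pd

# Forward pass: the judge sees the true A/B order and scores P(A preferred).
prefs_forward = pd.Series([1.0, 0.5, 0.0])
# Reversed pass: completions are swapped, so the judge's score is
# P(B preferred); 1 - score maps it back to P(A preferred), exactly as
# `preference_parts.append(1 - prefs_reversed)` does above.
prefs_reversed = pd.Series([0.0, 0.5, 1.0])

preferences = pd.concat([prefs_forward, 1 - prefs_reversed]).reset_index(drop=True)
print(preferences.mean())  # position-debiased win rate for A: 0.5 here

Averaging both orderings cancels any constant bonus the judge gives to the first-listed answer.
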
judgearena/evaluate.py

Lines changed: 22 additions & 6 deletions
@@ -51,30 +51,46 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
         return float(m.group(group_index).strip(" "))
 
 
+_COMPLETION_LABEL_SINGLE = "Answer"
+_COMPLETION_LABEL_MULTI_TURN = "Conversation with User"
+_EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement"
+_SCORE_FENCE = "\n```"
+
+
 def load_judge_system_and_user_prompt(
     provide_explanation: bool = True,
+    multi_turn: bool = False,
 ) -> tuple[str, str]:
-    # Prepare judge
-    with open(Path(__file__).parent / "prompts" / "system-prompt.txt") as f:
-        system_prompt = str(f.read())
+    prompts_dir = Path(__file__).parent / "prompts"
+    system_prompt = (prompts_dir / "system-prompt.txt").read_text()
 
     prompt_filename = (
         "prompt-with-explanation.txt" if provide_explanation else "prompt.txt"
     )
-    with open(Path(__file__).parent / "prompts" / prompt_filename) as f:
-        user_prompt_template = str(f.read())
+    user_prompt_template = (prompts_dir / prompt_filename).read_text()
+    user_prompt_template = user_prompt_template.replace(
+        "{completion_label}",
+        _COMPLETION_LABEL_MULTI_TURN if multi_turn else _COMPLETION_LABEL_SINGLE,
+    )
+    user_prompt_template = user_prompt_template.replace(
+        "{explanation_suffix}",
+        _EXPLANATION_SUFFIX if provide_explanation else _SCORE_FENCE,
+    )
 
     return system_prompt, user_prompt_template
 
 
 def resolve_judge_prompts(
     *,
     provide_explanation: bool,
+    multi_turn: bool = False,
     system_prompt: str | None = None,
     user_prompt_template: str | None = None,
 ) -> tuple[str, str]:
     default_system_prompt, default_user_prompt_template = (
-        load_judge_system_and_user_prompt(provide_explanation=provide_explanation)
+        load_judge_system_and_user_prompt(
+            provide_explanation=provide_explanation, multi_turn=multi_turn
+        )
     )
     return (
         system_prompt if system_prompt is not None else default_system_prompt,