
Commit 223db1b

Add ruff checks and format codebase (#28)
* Update license placeholder
* Update pyproject.toml with metadata
* add smoke tests
* add publishing workflow
* remove testpypi from pyproject.toml
* change authors to maintainers
* add ruff checks
  - replace black with ruff
  - add ruff to pre-commit
  - add pre-commit (with ruff) to CI
* upgrade pre-commit version
* format
* ignore rules
* format
1 parent d2d67d6 commit 223db1b
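
Most of the Python churn in this commit is mechanical and consistent with running ruff's linter (`ruff check --fix`) and formatter (`ruff format`): import blocks re-sorted, assigned lambdas rewritten as `def`s, multi-line asserts reflowed, `zip(...)` calls given `strict=True`, redundant `open(..., "r")` modes dropped, and `datetime.timezone.utc` replaced by the `datetime.UTC` alias. Several hunks below remove and re-add a line with identical visible text; those are trailing-whitespace or end-of-file fixes from the pre-commit hooks.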

24 files changed: 146 additions & 107 deletions

Lines changed: 7 additions & 5 deletions
@@ -1,7 +1,7 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# Install Python dependencies, run pre-commit (lint/format), and pytest.
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

-name: Run pytest
+name: CI

 on:
   push:
@@ -14,7 +14,7 @@ permissions:

 jobs:
   build:
-
+
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v3
@@ -24,6 +24,8 @@ jobs:
         enable-cache: true
         python-version: "3.12"
     - name: Install dependencies
-      run: uv sync --all-extras --group dev
+      run: uv sync
+    - name: Run pre-commit
+      run: uv run pre-commit run --all-files
     - name: Test with pytest
-      run: uv run pytest
+      run: uv run pytest
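
With this change the workflow installs dependencies with a plain `uv sync` (uv includes the `dev` dependency group by default), runs every configured pre-commit hook via `uv run pre-commit run --all-files`, and only then runs pytest, so lint and format failures stop CI before the test step.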

.pre-commit-config.yaml

Lines changed: 14 additions & 5 deletions
@@ -1,14 +1,23 @@
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.5.0
+  rev: v6.0.0
   hooks:
   - id: trailing-whitespace
   - id: end-of-file-fixer
   - id: check-yaml
   - id: check-added-large-files

-- repo: https://github.com/psf/black
-  rev: 24.1.1
+- repo: local
   hooks:
-  - id: black
-    language_version: python3
+  - id: ruff
+    name: ruff
+    entry: uv run ruff check --fix --force-exclude
+    language: system
+    types_or: [python, pyi]
+    require_serial: true
+  - id: ruff-format
+    name: ruff-format
+    entry: uv run ruff format --force-exclude
+    language: system
+    types_or: [python, pyi]
+    require_serial: true
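
Because these are `repo: local` hooks with `language: system`, pre-commit does not build its own isolated hook environment; it shells out to `uv run ruff ...`, so the ruff version is whatever the project's lockfile resolves. `--force-exclude` keeps ruff's configured excludes in effect even though pre-commit passes filenames explicitly, and `require_serial: true` runs each hook as a single process.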

README.md

Lines changed: 7 additions & 7 deletions
@@ -24,7 +24,7 @@ Compared to other libraries, here is a breakdown of features:
 | **Evalchemy** |||||||
 | **JudgeArena** | 🔜 ||||||

-The table has been done on Oct 2025, in case some libraries implemented missing features, please open an issue
+The table has been done on Oct 2025, in case some libraries implemented missing features, please open an issue
 or send a PR, we will be happy to update the information.

 ## 🚀 Quick Start
@@ -34,7 +34,7 @@ or send a PR, we will be happy to update the information.
 ```bash
 git clone https://github.com/OpenEuroLLM/JudgeArena
 cd JudgeArena
-uv sync
+uv sync
 uv sync --extra vllm # Optional: install vLLM support
 uv sync --extra llamacpp # Optional: install LlamaCpp support
 ```
@@ -49,19 +49,19 @@ python judgearena/generate_and_evaluate.py \
   --model_A gpt4_1106_preview \
   --model_B VLLM/utter-project/EuroLLM-9B \
   --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
-  --n_instructions 10
+  --n_instructions 10
 ```

 **What happens here?**
 - Use completions available for `gpt4_1106_preview` in Alpaca-Eval dataset
 - Generates completions for `model_B` if not already cached on `vLLM`
-- Compares two models using `deepseek-chat-v3.1` which the cheapest option available on `OpenRouter`
+- Compares two models using `deepseek-chat-v3.1` which the cheapest option available on `OpenRouter`

 It will then display the results of the battles:

 ```bash
 ============================================================
-🏆 MODEL BATTLE RESULTS 🏆
+🏆 MODEL BATTLE RESULTS 🏆
 📊 Dataset: alpaca-eval
 🤖 Competitors: Model A: gpt4_1106_preview vs Model B: VLLM/utter-project/EuroLLM-9B
 ⚖️ Judge: OpenRouter/deepseek/deepseek-chat-v3.1
@@ -84,7 +84,7 @@ The evaluation scripts expose four different length controls with different role

 ### Engine-Specific Configuration (`--engine_kwargs`)

-Some providers expose additional engine-level knobs (for example, vLLM allows configuring tensor parallelism or GPU memory utilization).
+Some providers expose additional engine-level knobs (for example, vLLM allows configuring tensor parallelism or GPU memory utilization).
 JudgeArena lets you forward these options directly to the underlying engine via `--engine_kwargs`, which expects a JSON object.

 For instance, to run vLLM with tensor parallelism across multiple GPUs:
@@ -123,7 +123,7 @@ python judgearena/generate_and_evaluate.py \
   --model_A VLLM/Qwen/Qwen2.5-0.5B-Instruct \
   --model_B VLLM/Qwen/Qwen2.5-1.5B-Instruct \
   --judge_model VLLM/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8 \
-  --n_instructions 10
+  --n_instructions 10
 ```

 ### Running locally with LlamaCpp

TODOs.md

Lines changed: 3 additions & 3 deletions
@@ -6,7 +6,7 @@ TODOs:
 * CI [high/large]
   * implement CI judge option
   * implement domain filter in CI (maybe pass a regexp by column?)
-* report cost?
+* report cost?

 Done:
 * support alpaca-eval
@@ -22,7 +22,7 @@ Done:
 * CLI launcher [medium/large]
 * put contexts in HF dataset [high/small]
 * mAH: instruction loader [DONE]
-* mAH: generate instructions for two models [DONE]
+* mAH: generate instructions for two models [DONE]
 * mAH: make comparison [DONE]
 * mAH: support using all languages at once [high/medium]
 * unit-test
@@ -37,4 +37,4 @@ Done:
 * small refactor `annotate` needs to return just the judge completion, not the parsed one
   * perhaps change to `annotate_pair` and `annotate_single`
   * then provide example
-* support evaluation with input swap
+* support evaluation with input swap

judgearena/arenas_utils.py

Lines changed: 2 additions & 1 deletion
@@ -82,7 +82,8 @@ def get_winner(
         return None
     if chosen_model_name not in [model_a, model_b]:
         warnings.warn(
-            f"Chosen model {chosen_model_name!r} not in model_a={model_a!r} or model_b={model_b!r}; skipping."
+            f"Chosen model {chosen_model_name!r} not in model_a={model_a!r} or model_b={model_b!r}; skipping.",
+            stacklevel=2,
         )
         return None
     return "model_a" if chosen_model_name == model_a else "model_b"

judgearena/criteria/io.py

Lines changed: 1 addition & 2 deletions
@@ -15,8 +15,7 @@ def _load_criteria_data(path: str | Path) -> dict:
     suffix = path.suffix.lower()
     if suffix not in {".yaml", ".yml"}:
         raise ValueError(
-            f"Unsupported criteria file format '{path.suffix}'. "
-            "Use .yaml or .yml."
+            f"Unsupported criteria file format '{path.suffix}'. Use .yaml or .yml."
         )

     data = yaml.safe_load(path.read_text())
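
The two implicitly concatenated string literals were merged into a single f-string that now fits on one line; the error message is unchanged. Implicit concatenation is what ruff's ISC rules flag, so this is likely one of the `format` cleanups mentioned in the commit message.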

judgearena/criteria/schema.py

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
 from dataclasses import dataclass, field
 from typing import Any

-
 SCALE_MIN = 1
 SCALE_MAX = 10


judgearena/estimate_elo_ratings.py

Lines changed: 11 additions & 9 deletions
@@ -8,10 +8,10 @@
 import pandas as pd
 from sklearn.linear_model import LogisticRegression

-from judgearena.arenas_utils import load_arena_dataframe, _extract_instruction_text
+from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
 from judgearena.evaluate import judge_and_parse_prefs
 from judgearena.generate import generate_instructions
-from judgearena.utils import make_model, cache_function_dataframe, compute_pref_summary
+from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model


 @dataclass
@@ -38,9 +38,9 @@ class CliEloArgs:

     def __post_init__(self):
         supported_modes = ["fixed", "both"]
-        assert (
-            self.swap_mode in supported_modes
-        ), f"Only {supported_modes} modes are supported but got {self.swap_mode}."
+        assert self.swap_mode in supported_modes, (
+            f"Only {supported_modes} modes are supported but got {self.swap_mode}."
+        )

     @classmethod
     def parse_args(cls):
@@ -201,7 +201,7 @@ def parse_args(cls):
             if not isinstance(engine_kwargs, dict):
                 raise ValueError("engine_kwargs must be a JSON object")
         except Exception as e:
-            raise SystemExit(f"Failed to parse --engine_kwargs: {e}")
+            raise SystemExit(f"Failed to parse --engine_kwargs: {e}") from e

         return cls(
             arena=args.arena,
@@ -397,7 +397,9 @@ def main(args: CliEloArgs | None = None) -> dict:
         **extra_kwargs,
     )

-    replace_slash = lambda s: s.replace("/", "_")
+    def replace_slash(s: str) -> str:
+        return s.replace("/", "_")
+
     languages_str = "-".join(sorted(args.languages)) if args.languages else "all"
     extra_kwargs_str = (
         "_".join(f"{k}={v}" for k, v in sorted(extra_kwargs.items()))
@@ -511,7 +513,7 @@ def run_judge() -> pd.DataFrame:
     model_name = args.model
     battle_results = []
     for pref, is_pos_a, opp_model in zip(
-        prefs, our_model_is_position_a, opponent_models
+        prefs, our_model_is_position_a, opponent_models, strict=True
    ):
         if pref is None or pref == 0.5:
             winner = "tie"
@@ -536,7 +538,7 @@ def run_judge() -> pd.DataFrame:
     prefs_normalized = pd.Series(
         [
             p if (p is None or is_pos_a) else (1 - p)
-            for p, is_pos_a in zip(prefs, our_model_is_position_a)
+            for p, is_pos_a in zip(prefs, our_model_is_position_a, strict=True)
         ]
     )
     summary = compute_pref_summary(prefs_normalized)
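
Beyond formatting, two hardening fixes land in this file: `raise SystemExit(...) from e` (ruff's B904) keeps the original JSON parse error in the exception chain, and `zip(..., strict=True)` (B905) makes a length mismatch between the paired lists an immediate error instead of silent truncation. A standalone illustration of the `zip` change (Python 3.10+), not project code:

```python
prefs = [0.7, 0.3, None]   # e.g. one preference per battle
positions = [True, False]  # one flag short of the prefs list

# Plain zip stops at the shorter input, silently dropping the last pref:
print(list(zip(prefs, positions)))  # [(0.7, True), (0.3, False)]

# With strict=True the mismatch surfaces immediately as a ValueError:
try:
    list(zip(prefs, positions, strict=True))
except ValueError as err:
    print(err)  # "zip() argument 2 is shorter than argument 1"
```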

judgearena/evaluate.py

Lines changed: 18 additions & 17 deletions
@@ -1,22 +1,22 @@
 import json
 import re
 from dataclasses import dataclass
-from datetime import datetime, timezone
+from datetime import UTC, datetime
 from pathlib import Path

 import numpy as np
 import pandas as pd
-from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.language_models.llms import LLM
+from langchain_core.prompts import ChatPromptTemplate

 from judgearena.instruction_dataset import load_instructions
-from judgearena.repro import write_run_metadata, _to_jsonable
+from judgearena.repro import _to_jsonable, write_run_metadata
 from judgearena.utils import (
     compute_pref_summary,
-    read_df,
     data_root,
-    download_hf,
     do_inference,
+    download_hf,
+    read_df,
 )


@@ -55,13 +55,13 @@ def load_judge_system_and_user_prompt(
     provide_explanation: bool = True,
 ) -> tuple[str, str]:
     # Prepare judge
-    with open(Path(__file__).parent / "prompts" / "system-prompt.txt", "r") as f:
+    with open(Path(__file__).parent / "prompts" / "system-prompt.txt") as f:
         system_prompt = str(f.read())

     prompt_filename = (
         "prompt-with-explanation.txt" if provide_explanation else "prompt.txt"
     )
-    with open(Path(__file__).parent / "prompts" / prompt_filename, "r") as f:
+    with open(Path(__file__).parent / "prompts" / prompt_filename) as f:
         user_prompt_template = str(f.read())

     return system_prompt, user_prompt_template
@@ -109,7 +109,7 @@ def evaluate_completions(
         exceeding context limit
     :return:
     """
-    run_started_at = datetime.now(timezone.utc)
+    run_started_at = datetime.now(UTC)
     local_path_tables = data_root / "tables"
     download_hf(name=dataset, local_path=local_path_tables)

@@ -140,9 +140,9 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
             return df.loc[:, "output"]
         else:
             print(f"Loading {method} from {dataset} dataset.")
-            assert (
-                method in df_outputs.columns
-            ), f"Method {method} not present, pick among {df_outputs.columns.tolist()}"
+            assert method in df_outputs.columns, (
+                f"Method {method} not present, pick among {df_outputs.columns.tolist()}"
+            )
             return df_outputs.loc[:, method].sort_index()

     completions_A = get_output(df_outputs=df_outputs, dataset=dataset, method=method_A)
@@ -151,9 +151,9 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
     instructions = instructions.head(num_annotations)
     completions_A = completions_A.head(num_annotations)
     completions_B = completions_B.head(num_annotations)
-    assert (
-        completions_A.index.tolist() == completions_B.index.tolist()
-    ), f"Index mismatch between methods {method_A} and {method_B}."
+    assert completions_A.index.tolist() == completions_B.index.tolist(), (
+        f"Index mismatch between methods {method_A} and {method_B}."
+    )

     if judge_chat_model is None:
         from langchain_together.llms import Together
@@ -303,7 +303,7 @@ def truncate(s: str, max_len: int | None = None):
                 "completion_B": truncate(completion_B, max_len=truncate_input_chars),
             }
             for user_prompt, completion_A, completion_B in zip(
-                instructions, completions_A, completions_B
+                instructions, completions_A, completions_B, strict=True
             )
         ]
     )
@@ -316,7 +316,7 @@ def truncate(s: str, max_len: int | None = None):

     annotations = []
     for judge_completion, instruction, completion_A, completion_B in zip(
-        judge_completions, instructions, completions_A, completions_B
+        judge_completions, instructions, completions_A, completions_B, strict=True
     ):
         annotations.append(
             JudgeAnnotation(
@@ -381,7 +381,8 @@ def judge_and_parse_prefs(
         use_tqdm=use_tqdm,
     )

-    _none_to_nan = lambda x: float("nan") if x is None else x
+    def _none_to_nan(x):
+        return float("nan") if x is None else x

     score_parser = PairScore()
     prefs = pd.Series(
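
The import and call-site rewrites here are pyupgrade-style changes that ruff can apply automatically: `datetime.UTC` has been an alias of `datetime.timezone.utc` since Python 3.11, and `"r"` is already `open()`'s default mode, so both edits are behavior-preserving. A quick self-contained check (Python 3.11+), independent of the project:

```python
from datetime import UTC, datetime, timezone

# UTC is the same singleton as timezone.utc, so timestamps are unchanged.
assert UTC is timezone.utc
print(datetime.now(UTC).isoformat())

# open(path) is equivalent to open(path, "r"): text-mode read is the default.
with open(__file__) as f:
    print(f.readline())
```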
