update inheritance and solve mt-bench merge problems

kargibora · kargibora · commit 9dc32b0090a5 · 2026-04-07T10:35:18.000+02:00
diff --git a/judgearena/cli_common.py b/judgearena/cli_common.py
@@ -74,8 +74,8 @@ def add_common_arguments(parser: argparse.ArgumentParser) -> None:
         help=(
             "Model comparison order mode. 'fixed': always use model order A-B. "
             "'both': correct for model order bias by evaluating each instruction "
-            "twice, once as A-B and once as B-A, and average. This helps account "
-            "for judge position bias. Default is 'fixed'."
+            "twice, once as A-B and once as B-A, and concatenating the results. "
+            "This helps account for judge position bias. Default is 'fixed'."
         ),
     )
     parser.add_argument(
diff --git a/judgearena/config.py b/judgearena/config.py
diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
@@ -16,10 +16,16 @@
 
 @dataclass
 class CliEloArgs(BaseCliArgs):
-    """CLI arguments for the ELO rating estimation entrypoint."""
+    """CLI arguments for the ELO rating estimation entrypoint.
 
-    arena: str = ""
-    model: str = ""
+    Note: inheriting from a dataclass (BaseCliArgs) forces every field here to
+    have a default value, even for fields like ``arena`` and ``model`` that
+    logically should be required.  If this becomes too messy we may want to
+    move away from dataclass inheritance.
+    """
+
+    arena: str | None = None
+    model: str | None = None
     n_instructions_per_language: int | None = None
     languages: list[str] | None = None
     n_bootstraps: int = 20
diff --git a/judgearena/generate_and_evaluate.py b/judgearena/generate_and_evaluate.py
@@ -69,9 +69,9 @@ def try_load_dataset_completions(
 class CliArgs(BaseCliArgs):
     """CLI arguments for the generate-and-evaluate entrypoint."""
 
-    dataset: str = ""
-    model_A: str = ""
-    model_B: str = ""
+    dataset: str | None = None
+    model_A: str | None = None
+    model_B: str | None = None
     use_tqdm: bool = False
 
     @classmethod
diff --git a/judgearena/mt_bench/mt_bench_utils.py b/judgearena/mt_bench/mt_bench_utils.py
@@ -26,7 +26,7 @@
 from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model
 
 if TYPE_CHECKING:
-    from judgearena.config import CliArgs
+    from judgearena.generate_and_evaluate import CliArgs
 
 
 def _generate_mt_bench_completions(
diff --git a/judgearena/utils.py b/judgearena/utils.py
@@ -49,23 +49,6 @@ def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
         return pd.read_parquet(filename, **pandas_kwargs)
 
 
-def truncate(s: str, max_len: int | None = None) -> str:
-    if not isinstance(s, str):
-        return ""
-    if max_len is not None:
-        return s[:max_len]
-    return s
-
-
-def safe_text(value: object, truncate_chars: int | None) -> str:
-    if value is None:
-        return ""
-    is_missing = pd.isna(value)
-    if isinstance(is_missing, bool) and is_missing:
-        return ""
-    return truncate(str(value), max_len=truncate_chars)
-
-
 def compute_pref_summary(prefs: pd.Series) -> dict[str, float | int]:
     """Compute win/loss/tie stats for preference series (0=A, 0.5=tie, 1=B)."""
     prefs = pd.Series(prefs, dtype="float64")
@@ -99,6 +82,20 @@ def truncate(s: str, max_len: int | None = None) -> str:
     return s
 
 
+def safe_text(value: object, truncate_chars: int | None) -> str:
+    """Coerce *value* to a string and optionally truncate.
+
+    Returns the empty string for ``None`` and NaN-like values so callers
+    don't have to guard against missing data.
+    """
+    if value is None:
+        return ""
+    is_missing = pd.isna(value)
+    if isinstance(is_missing, bool) and is_missing:
+        return ""
+    return truncate(str(value), max_len=truncate_chars)
+
+
 def do_inference(chat_model, inputs, use_tqdm: bool = False):
     # Retries on rate-limit/server errors with exponential backoff.
     # Async path retries individual calls; batch path splits into 4^attempt chunks on failure.