Skip to content

Commit 9dc32b0

Browse files
committed
update inheritance and solve mt-bench merge problems
1 parent 4cd8c93 commit 9dc32b0

6 files changed

Lines changed: 29 additions & 207 deletions

File tree

judgearena/cli_common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ def add_common_arguments(parser: argparse.ArgumentParser) -> None:
7474
help=(
7575
"Model comparison order mode. 'fixed': always use model order A-B. "
7676
"'both': correct for model order bias by evaluating each instruction "
77-
"twice, once as A-B and once as B-A, and average. This helps account "
78-
"for judge position bias. Default is 'fixed'."
77+
"twice, once as A-B and once as B-A, and concatenating the results. "
78+
"This helps account for judge position bias. Default is 'fixed'."
7979
),
8080
)
8181
parser.add_argument(

judgearena/config.py

Lines changed: 0 additions & 181 deletions
This file was deleted.

judgearena/estimate_elo_ratings.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,16 @@
1616

1717
@dataclass
1818
class CliEloArgs(BaseCliArgs):
19-
"""CLI arguments for the ELO rating estimation entrypoint."""
19+
"""CLI arguments for the ELO rating estimation entrypoint.
2020
21-
arena: str = ""
22-
model: str = ""
21+
Note: inheriting from a dataclass (BaseCliArgs) forces every field here to
22+
have a default value, even for fields like ``arena`` and ``model`` that
23+
logically should be required. If this becomes too messy we may want to
24+
move away from dataclass inheritance.
25+
"""
26+
27+
arena: str | None = None
28+
model: str | None = None
2329
n_instructions_per_language: int | None = None
2430
languages: list[str] | None = None
2531
n_bootstraps: int = 20

judgearena/generate_and_evaluate.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,9 @@ def try_load_dataset_completions(
6969
class CliArgs(BaseCliArgs):
7070
"""CLI arguments for the generate-and-evaluate entrypoint."""
7171

72-
dataset: str = ""
73-
model_A: str = ""
74-
model_B: str = ""
72+
dataset: str | None = None
73+
model_A: str | None = None
74+
model_B: str | None = None
7575
use_tqdm: bool = False
7676

7777
@classmethod

judgearena/mt_bench/mt_bench_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model
2727

2828
if TYPE_CHECKING:
29-
from judgearena.config import CliArgs
29+
from judgearena.generate_and_evaluate import CliArgs
3030

3131

3232
def _generate_mt_bench_completions(

judgearena/utils.py

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -49,23 +49,6 @@ def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
4949
return pd.read_parquet(filename, **pandas_kwargs)
5050

5151

52-
def truncate(s: str, max_len: int | None = None) -> str:
53-
if not isinstance(s, str):
54-
return ""
55-
if max_len is not None:
56-
return s[:max_len]
57-
return s
58-
59-
60-
def safe_text(value: object, truncate_chars: int | None) -> str:
61-
if value is None:
62-
return ""
63-
is_missing = pd.isna(value)
64-
if isinstance(is_missing, bool) and is_missing:
65-
return ""
66-
return truncate(str(value), max_len=truncate_chars)
67-
68-
6952
def compute_pref_summary(prefs: pd.Series) -> dict[str, float | int]:
7053
"""Compute win/loss/tie stats for preference series (0=A, 0.5=tie, 1=B)."""
7154
prefs = pd.Series(prefs, dtype="float64")
@@ -99,6 +82,20 @@ def truncate(s: str, max_len: int | None = None) -> str:
9982
return s
10083

10184

85+
def safe_text(value: object, truncate_chars: int | None) -> str:
86+
"""Coerce *value* to a string and optionally truncate.
87+
88+
Returns the empty string for ``None`` and NaN-like values so callers
89+
don't have to guard against missing data.
90+
"""
91+
if value is None:
92+
return ""
93+
is_missing = pd.isna(value)
94+
if isinstance(is_missing, bool) and is_missing:
95+
return ""
96+
return truncate(str(value), max_len=truncate_chars)
97+
98+
10299
def do_inference(chat_model, inputs, use_tqdm: bool = False):
103100
# Retries on rate-limit/server errors with exponential backoff.
104101
# Async path retries individual calls; batch path splits into 4^attempt chunks on failure.

0 commit comments

Comments
 (0)