
Commit cc440e5

Unify CLI configuration & deduplicate truncate (#31)
* Refactor CLI argument handling by unifying common configurations and removing duplication across entrypoints
* Update inheritance and resolve mt-bench merge problems
* Remove unused import
1 parent 31dc7a3 commit cc440e5

8 files changed

Lines changed: 231 additions & 480 deletions

File tree

judgearena/cli_common.py

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
"""Shared CLI configuration for judgearena entrypoints.

Houses the base dataclass fields and argparse definitions that are common
to both ``judgearena`` (generate_and_evaluate) and ``judgearena-elo``
(estimate_elo_ratings) CLI tools.
"""

from __future__ import annotations

import argparse
import json
from dataclasses import dataclass, field


@dataclass
class BaseCliArgs:
    """Fields shared by every judgearena CLI entrypoint."""

    judge_model: str

    n_instructions: int | None = None
    provide_explanation: bool = False
    swap_mode: str = "fixed"
    ignore_cache: bool = False
    truncate_all_input_chars: int = 8192
    max_out_tokens_models: int = 32768
    max_out_tokens_judge: int = 32768
    max_model_len: int | None = None
    chat_template: str | None = None
    result_folder: str = "results"
    engine_kwargs: dict = field(default_factory=dict)

    def __post_init__(self):
        supported_modes = ["fixed", "both"]
        assert self.swap_mode in supported_modes, (
            f"Only {supported_modes} modes are supported but got {self.swap_mode}."
        )


def add_common_arguments(parser: argparse.ArgumentParser) -> None:
    """Register the CLI flags shared by all judgearena entrypoints."""
    parser.add_argument(
        "--judge",
        "--judge_model",
        dest="judge_model",
        required=True,
        help=(
            "Name of the LLM to use as judge, for instance "
            "`Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, "
            "`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, "
            "`LlamaCpp/path/to/model.gguf` etc"
        ),
    )
    parser.add_argument(
        "--n_instructions",
        type=int,
        required=False,
    )
    parser.add_argument(
        "--provide_explanation",
        action="store_true",
        help=(
            "If specified, judge will provide explanation before making a "
            "judgement. Does not necessarily improve the accuracy of the judge "
            "but enables some result interpretation."
        ),
    )
    parser.add_argument(
        "--swap_mode",
        type=str,
        choices=["fixed", "both"],
        default="fixed",
        help=(
            "Model comparison order mode. 'fixed': always use model order A-B. "
            "'both': correct for model order bias by evaluating each instruction "
            "twice, once as A-B and once as B-A, and concatenating the results. "
            "This helps account for judge position bias. Default is 'fixed'."
        ),
    )
    parser.add_argument(
        "--ignore_cache",
        action="store_true",
        help="If specified, ignore cache of previous completions.",
    )
    parser.add_argument(
        "--result_folder",
        type=str,
        required=False,
        default="results",
        help=(
            "The folder to save the results. Defaults to `results`. Evaluation "
            "results will be saved in `[result_folder]/[evaluation_name]`."
        ),
    )
    parser.add_argument(
        "--truncate_all_input_chars",
        type=int,
        required=False,
        default=8192,
        help=(
            "Character-level truncation applied before tokenization: truncates "
            "each instruction before model A/B generation and truncates each "
            "completion before judge evaluation."
        ),
    )
    parser.add_argument(
        "--max_out_tokens_models",
        type=int,
        required=False,
        default=32768,
        help=(
            "Generation token budget for each model A/B response. For VLLM, "
            "keep this <= --max_model_len (if provided)."
        ),
    )
    parser.add_argument(
        "--max_out_tokens_judge",
        type=int,
        required=False,
        default=32768,
        help=(
            "Generation token budget for the judge response (reasoning + scores). "
            "For VLLM, keep this <= --max_model_len (if provided)."
        ),
    )
    parser.add_argument(
        "--max_model_len",
        type=int,
        required=False,
        default=None,
        help=(
            "Optional total context window for VLLM models (prompt + generation). "
            "This is independent from --max_out_tokens_models/--max_out_tokens_judge, "
            "which only cap generated tokens. This is useful on smaller GPUs to "
            "avoid OOM."
        ),
    )
    parser.add_argument(
        "--chat_template",
        type=str,
        required=False,
        default=None,
        help=(
            "Jinja2 chat template string to use instead of the model's tokenizer "
            "template. If not provided, ChatML is used as fallback for models "
            "without a chat template."
        ),
    )
    parser.add_argument(
        "--engine_kwargs",
        type=str,
        required=False,
        default="{}",
        help=(
            "JSON dict of engine-specific kwargs forwarded to the underlying "
            "engine. Example for vLLM: "
            '\'{"tensor_parallel_size": 2, "gpu_memory_utilization": 0.9}\'.'
        ),
    )


def parse_engine_kwargs(raw: str) -> dict:
    """Parse and validate a JSON string into an engine-kwargs dict."""
    try:
        engine_kwargs = json.loads(raw) if raw else {}
        if not isinstance(engine_kwargs, dict):
            raise ValueError("engine_kwargs must be a JSON object")
    except Exception as e:
        raise SystemExit(f"Failed to parse --engine_kwargs: {e}") from e
    return engine_kwargs
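
Note: a minimal sketch of how an entrypoint might consume this module, assuming a dataclass-subclass pattern; `GenerateCliArgs`, `build_args`, and the `--models` flag below are hypothetical illustrations, not part of this commit.

# Hypothetical entrypoint wiring (names below are assumptions).
import argparse
from dataclasses import dataclass

from judgearena.cli_common import BaseCliArgs, add_common_arguments, parse_engine_kwargs


@dataclass
class GenerateCliArgs(BaseCliArgs):
    # Entrypoint-specific fields extend the shared base.
    models: str = ""


def build_args() -> GenerateCliArgs:
    parser = argparse.ArgumentParser(prog="judgearena")
    add_common_arguments(parser)  # shared flags defined once in cli_common
    parser.add_argument("--models", required=True)  # hypothetical extra flag
    ns = parser.parse_args()
    ns.engine_kwargs = parse_engine_kwargs(ns.engine_kwargs)  # JSON str -> dict
    # The argparse dest names match the dataclass field names, so the
    # namespace maps directly; __post_init__ then validates swap_mode.
    return GenerateCliArgs(**vars(ns))

Because every shared flag's `dest` matches a `BaseCliArgs` field, each entrypoint only declares its own extra flags and fields, which is the deduplication the commit message describes.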

judgearena/config.py

Lines changed: 0 additions & 181 deletions
This file was deleted.
