Skip to content

Commit ab3db1b

Browse files
committed
address PR review: json schema structured outputs, tighten vllm range
- Switch from choice-based structured outputs (121 cartesian-product entries) to a JSON schema constraint via StructuredOutputsParams(json=...). This scales to multi-criteria evaluation without combinatorial explosion.
- Tighten the vllm version range from >=0.17.0,<1.0.0 to >=0.17.0,<0.19.0 (tested with 0.18.1 on the cluster).
- Update tests to validate the JSON schema structure.

Includes-AI-Code: true
Made-with: Cursor
1 parent ef1c92c commit ab3db1b

5 files changed

Lines changed: 33 additions & 20 deletions

File tree

judgearena/evaluate.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,21 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
5656
_PAIR_SCORE_MAX = 10
5757

5858

59-
def build_pair_score_output_choices() -> list[str]:
60-
return [
61-
f"score_A: {a}\nscore_B: {b}"
62-
for a in range(_PAIR_SCORE_MIN, _PAIR_SCORE_MAX + 1)
63-
for b in range(_PAIR_SCORE_MIN, _PAIR_SCORE_MAX + 1)
64-
]
59+
def build_pair_score_json_schema() -> dict:
60+
score_field = {
61+
"type": "integer",
62+
"minimum": _PAIR_SCORE_MIN,
63+
"maximum": _PAIR_SCORE_MAX,
64+
}
65+
return {
66+
"type": "object",
67+
"properties": {
68+
"score_A": score_field,
69+
"score_B": score_field,
70+
},
71+
"required": ["score_A", "score_B"],
72+
"additionalProperties": False,
73+
}
6574

6675

6776
_COMPLETION_LABEL_SINGLE = "Answer"

judgearena/generate_and_evaluate.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import pandas as pd
1414

1515
from judgearena.evaluate import (
16-
build_pair_score_output_choices,
16+
build_pair_score_json_schema,
1717
judge_and_parse_prefs,
1818
resolve_judge_prompts,
1919
)
@@ -407,8 +407,8 @@ def main(args: CliArgs):
407407

408408
judge_model_kwargs = dict(args.engine_kwargs)
409409
if not args.provide_explanation and args.judge_model.split("/")[0] == "VLLM":
410-
judge_model_kwargs["structured_outputs_choice"] = (
411-
build_pair_score_output_choices()
410+
judge_model_kwargs["structured_outputs_json"] = (
411+
build_pair_score_json_schema()
412412
)
413413

414414
judge_chat_model = make_model(

judgearena/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -236,10 +236,10 @@ def __init__(
236236
"temperature": float(vllm_kwargs.pop("temperature", 0.6)),
237237
"top_p": float(vllm_kwargs.pop("top_p", 0.95)),
238238
}
239-
structured_outputs_choice = vllm_kwargs.pop("structured_outputs_choice", None)
240-
if structured_outputs_choice is not None:
239+
structured_outputs_json = vllm_kwargs.pop("structured_outputs_json", None)
240+
if structured_outputs_json is not None:
241241
self._sampling_params_kwargs["structured_outputs"] = (
242-
StructuredOutputsParams(choice=structured_outputs_choice)
242+
StructuredOutputsParams(json=structured_outputs_json)
243243
)
244244
self.sampling_params = SamplingParams(**self._sampling_params_kwargs)
245245

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,5 +82,6 @@ indent-style = "space"
8282

8383
[project.optional-dependencies]
8484
# vLLM on PyPI pins transformers<5; optional extra matches that so `uv lock` can resolve.
85-
vllm = ["vllm>=0.17.0,<1.0.0", "transformers>=4.56.0,<5.0.0"]
85+
# Tested with vllm 0.18.1; StructuredOutputsParams(json=...) requires >= 0.17.
86+
vllm = ["vllm>=0.17.0,<0.19.0", "transformers>=4.56.0,<5.0.0"]
8687
llamacpp = ["llama-cpp-python>=0.3.0"]

tests/test_local_completion_loading.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,16 @@
66
from judgearena.generate_and_evaluate import main as main_generate_and_eval
77

88

9-
def test_build_pair_score_output_choices_covers_all_integer_pairs():
10-
choices = evaluate.build_pair_score_output_choices()
11-
12-
assert len(choices) == 121
13-
assert len(set(choices)) == 121
14-
assert "score_A: 0\nscore_B: 0" in choices
15-
assert "score_A: 10\nscore_B: 10" in choices
9+
def test_build_pair_score_json_schema_covers_valid_range():
10+
schema = evaluate.build_pair_score_json_schema()
11+
12+
assert schema["type"] == "object"
13+
assert set(schema["required"]) == {"score_A", "score_B"}
14+
for key in ("score_A", "score_B"):
15+
assert schema["properties"][key]["type"] == "integer"
16+
assert schema["properties"][key]["minimum"] == 0
17+
assert schema["properties"][key]["maximum"] == 10
18+
assert schema["additionalProperties"] is False
1619

1720

1821
def test_main_aligns_local_reference_by_instruction_index(tmp_path, monkeypatch):

0 commit comments

Comments (0)