Skip to content

Commit 2fba9ba

Browse files
authored
Merge pull request #30 from sacredvoid/fix/parse-results-bool-filter
fix(eval): exclude booleans from parsed benchmark metrics
2 parents (5a167c3 + 7df0024), commit 2fba9ba

2 files changed

Lines changed: 19 additions & 1 deletion

File tree

src/alignrl/eval.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,10 @@ def parse_results(raw: dict[str, Any], model_name: str, stage: str) -> EvalResul
2828
"""Parse lm-evaluation-harness output into EvalResult."""
2929
benchmarks: dict[str, dict[str, float]] = {}
3030
for task_name, metrics in raw.get("results", {}).items():
31-
benchmarks[task_name] = {k: v for k, v in metrics.items() if isinstance(v, (int, float))}
31+
benchmarks[task_name] = {
32+
k: v for k, v in metrics.items()
33+
if isinstance(v, (int, float)) and not isinstance(v, bool)
34+
}
3235
return EvalResult(model_name=model_name, stage=stage, benchmarks=benchmarks)
3336

3437

tests/test_eval.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -171,3 +171,18 @@ def test_filters_non_numeric(self) -> None:
171171
def test_no_results_key(self) -> None:
172172
result = parse_results({}, model_name="test", stage="base")
173173
assert result.benchmarks == {}
174+
175+
def test_filters_booleans(self) -> None:
176+
raw = {
177+
"results": {
178+
"gsm8k": {
179+
"exact_match": 0.5,
180+
"has_config": True,
181+
"is_valid": False,
182+
}
183+
}
184+
}
185+
result = parse_results(raw, model_name="test", stage="base")
186+
assert "exact_match" in result.benchmarks["gsm8k"]
187+
assert "has_config" not in result.benchmarks["gsm8k"]
188+
assert "is_valid" not in result.benchmarks["gsm8k"]

0 commit comments

Comments
 (0)