
Commit 642ec7c

test: unit tests for CLI decompose and eval pure-logic helpers (#861) (#863)
* test: unit tests for CLI decompose and eval pure-logic helpers (#861)

  35 tests covering reorder_subtasks (topological sort + cycle detection), verify_user_variables (input var and dependency cross-validation), validate_filename (path-traversal-safe regex), and parse_judge_output (JSON-then-regex fallback scoring parser).

* test: strengthen CLI unit tests per code review (#861)

  - test_parse_invalid_json_falls_back_to_regex: assert reason == output to verify the regex fallback returns the raw text as justification, not just the score
  - _make_test_eval: pass an explicit test_id so the to_dict assertion is non-trivial
  - test_reorder_invalid_dependency_ignored: also assert the tag survives intact
  - test_reorder_case_insensitive_dependency: new test verifying tag lookups are case-normalised (depends_on=["A"] resolves tag "a")
  - test_valid_filename_max_length: boundary test for the 250-char upper limit

* test: add return type annotations to CLI test helper functions (#861)

  Aligns with codebase convention (test_openai_compatible_helpers.py, test_granite32_input.py, and test_unit_test_eval.py all annotate helpers).
1 parent 6825cb1 commit 642ec7c
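
The commit message names four pure-logic helpers. As a reading aid, here is a minimal sketch of the input-variable and dependency cross-validation that verify_user_variables performs, inferred only from the message and the tests below; the shipped helper in cli.decompose.decompose may differ, and the error strings here are assumptions pinned down by the tests' match= patterns.

from cli.decompose.decompose import reorder_subtasks


def verify_user_variables_sketch(data: dict, input_var: list[str] | None) -> dict:
    provided = set(input_var or [])  # input_var=None is treated as "no variables supplied"
    known_tags = {s["tag"].lower() for s in data["subtasks"]}
    for subtask in data["subtasks"]:
        # Every declared input variable must be supplied by the caller.
        for var in subtask["input_vars_required"]:
            if var not in provided:
                raise ValueError(
                    f"Subtask '{subtask['tag']}' requires input variable '{var}'"
                )
        # Every dependency must name an existing subtask tag (case-insensitively).
        for dep in subtask["depends_on"]:
            if dep.lower() not in known_tags:
                raise ValueError(f"Dependency '{dep}' does not exist")
    # Reorder so dependents always follow their dependencies, then return the same dict.
    data["subtasks"] = reorder_subtasks(data["subtasks"])
    return data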

2 files changed

Lines changed: 345 additions & 0 deletions

File tree

test/cli/test_decompose_unit.py
test/cli/test_eval_unit.py

test/cli/test_decompose_unit.py

Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
"""Unit tests for decompose pure-logic helpers — no backend, no file I/O required.

Covers reorder_subtasks, verify_user_variables, validate_filename.
"""

import pytest

from cli.decompose.decompose import reorder_subtasks, verify_user_variables
from cli.decompose.utils import validate_filename

# --- reorder_subtasks ---


def _subtask(tag: str, subtask: str, depends_on: list[str] | None = None) -> dict:
    """Minimal subtask dict for testing."""
    d = {
        "tag": tag,
        "subtask": subtask,
        "constraints": [],
        "prompt_template": "",
        "general_instructions": "",
        "input_vars_required": [],
        "depends_on": depends_on or [],
    }
    return d


def test_reorder_no_dependencies():
    subtasks = [_subtask("a", "1. Task A"), _subtask("b", "2. Task B")]
    result = reorder_subtasks(subtasks)
    # No dependencies — all tasks present, order unconstrained
    assert {s["tag"] for s in result} == {"a", "b"}


def test_reorder_respects_dependency():
    subtasks = [
        _subtask("b", "1. Task B", depends_on=["a"]),
        _subtask("a", "2. Task A"),
    ]
    result = reorder_subtasks(subtasks)
    tags = [s["tag"] for s in result]
    assert tags.index("a") < tags.index("b")


def test_reorder_chain_dependency():
    subtasks = [
        _subtask("c", "1. C", depends_on=["b"]),
        _subtask("b", "2. B", depends_on=["a"]),
        _subtask("a", "3. A"),
    ]
    result = reorder_subtasks(subtasks)
    tags = [s["tag"] for s in result]
    assert tags == ["a", "b", "c"]


def test_reorder_circular_raises():
    subtasks = [
        _subtask("a", "1. A", depends_on=["b"]),
        _subtask("b", "2. B", depends_on=["a"]),
    ]
    with pytest.raises(ValueError, match="Circular dependency"):
        reorder_subtasks(subtasks)


def test_reorder_renumbers_subtasks():
    subtasks = [
        _subtask("b", "2. Task B", depends_on=["a"]),
        _subtask("a", "1. Task A"),
    ]
    result = reorder_subtasks(subtasks)
    # After reordering, numbering should be updated
    assert result[0]["subtask"].startswith("1. ")
    assert result[1]["subtask"].startswith("2. ")


def test_reorder_invalid_dependency_ignored():
    subtasks = [_subtask("a", "1. A", depends_on=["nonexistent"])]
    result = reorder_subtasks(subtasks)
    assert len(result) == 1
    assert result[0]["tag"] == "a"


def test_reorder_case_insensitive_dependency():
    # Tags and depends_on are lowercased before lookup — mixed case must resolve correctly
    subtasks = [
        _subtask("b", "1. Task B", depends_on=["A"]),
        _subtask("a", "2. Task A"),
    ]
    result = reorder_subtasks(subtasks)
    tags = [s["tag"] for s in result]
    assert tags.index("a") < tags.index("b")


# --- verify_user_variables ---


def _decomp_data(subtasks: list[dict]) -> dict:
    return {
        "original_task_prompt": "",
        "subtask_list": [],
        "identified_constraints": [],
        "subtasks": subtasks,
    }


def test_verify_valid_input_vars():
    data = _decomp_data([_subtask("a", "A", depends_on=[])])
    data["subtasks"][0]["input_vars_required"] = ["doc"]
    result = verify_user_variables(data, input_var=["doc"])
    assert result is data


def test_verify_missing_input_var_raises():
    data = _decomp_data([_subtask("a", "A")])
    data["subtasks"][0]["input_vars_required"] = ["doc"]
    with pytest.raises(ValueError, match="requires input variable"):
        verify_user_variables(data, input_var=[])


def test_verify_missing_dependency_raises():
    data = _decomp_data([_subtask("a", "A", depends_on=["nonexistent"])])
    with pytest.raises(ValueError, match="does not exist"):
        verify_user_variables(data, input_var=[])


def test_verify_reorders_when_needed():
    data = _decomp_data(
        [_subtask("b", "1. B", depends_on=["a"]), _subtask("a", "2. A")]
    )
    result = verify_user_variables(data, input_var=None)
    tags = [s["tag"] for s in result["subtasks"]]
    assert tags.index("a") < tags.index("b")


def test_verify_no_reorder_when_already_sorted():
    data = _decomp_data(
        [_subtask("a", "1. A"), _subtask("b", "2. B", depends_on=["a"])]
    )
    result = verify_user_variables(data, input_var=None)
    tags = [s["tag"] for s in result["subtasks"]]
    assert tags == ["a", "b"]


def test_verify_none_input_var_treated_as_empty():
    data = _decomp_data([_subtask("a", "A")])
    result = verify_user_variables(data, input_var=None)
    assert result is data


# --- validate_filename ---


def test_valid_filename():
    assert validate_filename("my_output_file") is True


def test_valid_filename_with_extension():
    assert validate_filename("results.json") is True


def test_valid_filename_with_hyphen():
    assert validate_filename("my-output") is True


def test_valid_filename_with_spaces():
    assert validate_filename("my output file") is True


def test_invalid_filename_slash():
    assert validate_filename("path/to/file") is False


def test_invalid_filename_empty():
    assert validate_filename("") is False


def test_invalid_filename_single_char():
    # Pattern requires at least 2 chars (first char + rest)
    assert validate_filename("a") is False


def test_invalid_filename_starts_with_hyphen():
    assert validate_filename("-badname") is False


def test_valid_filename_starts_with_dot():
    assert validate_filename(".hidden_file") is True


def test_invalid_filename_too_long():
    assert validate_filename("a" * 251) is False


def test_valid_filename_max_length():
    assert validate_filename("a" * 250) is True


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
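
The reorder tests above pin down the helper's observable behaviour: case-normalised tag lookup, unknown dependencies silently ignored, ValueError on cycles, and renumbering of the "N. " prefix. A minimal sketch that would satisfy them, built on the stdlib graphlib (an assumption; the shipped implementation in cli.decompose.decompose is not shown in this diff and may be hand-rolled):

import re
from graphlib import CycleError, TopologicalSorter  # stdlib since Python 3.9


def reorder_subtasks_sketch(subtasks: list[dict]) -> list[dict]:
    by_tag = {s["tag"].lower(): s for s in subtasks}
    # Map each tag to its known predecessors; depends_on entries matching no
    # subtask are dropped (see test_reorder_invalid_dependency_ignored).
    graph = {
        tag: {d.lower() for d in s["depends_on"] if d.lower() in by_tag}
        for tag, s in by_tag.items()
    }
    try:
        order = list(TopologicalSorter(graph).static_order())
    except CycleError as exc:
        raise ValueError(f"Circular dependency detected: {exc.args[1]}") from exc
    result = [by_tag[tag] for tag in order]
    # Renumber the leading "N. " prefix to reflect the new execution order.
    for i, s in enumerate(result, start=1):
        s["subtask"] = f"{i}. " + re.sub(r"^\d+\.\s*", "", s["subtask"])
    return result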

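The filename tests constrain the validation pattern tightly: 2 to 250 characters, a leading word character or dot (so hidden files pass but a leading hyphen fails), and no path separators. One regex among several possible that satisfies every case above; this is a reconstruction, not the actual pattern in cli.decompose.utils:

import re

# Hypothetical reconstruction: first char is a word character or dot,
# followed by 1-249 word characters, dots, spaces, or hyphens.
_FILENAME_RE = re.compile(r"[\w.][\w. -]{1,249}")


def validate_filename_sketch(name: str) -> bool:
    return _FILENAME_RE.fullmatch(name) is not None
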
test/cli/test_eval_unit.py

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
"""Unit tests for eval runner pure-logic helpers — no backend, no model required.

Covers InputEvalResult, TestEvalResult, parse_judge_output.
"""

import pytest

from cli.eval.runner import InputEvalResult, TestEvalResult, parse_judge_output
from mellea.stdlib.components.unit_test_eval import TestBasedEval

# --- InputEvalResult ---


def test_input_eval_result_to_dict():
    r = InputEvalResult(
        input_text="What is 2+2?",
        model_output="4",
        validation_passed=True,
        score=1,
        validation_reason="Correct answer",
    )
    d = r.to_dict()
    assert d["input"] == "What is 2+2?"
    assert d["model_output"] == "4"
    assert d["passed"] is True
    assert d["score"] == 1
    assert d["justification"] == "Correct answer"


def test_input_eval_result_to_dict_failed():
    r = InputEvalResult("q", "wrong", False, 0, "Incorrect")
    d = r.to_dict()
    assert d["passed"] is False
    assert d["score"] == 0


# --- TestEvalResult ---


def _make_test_eval() -> TestBasedEval:
    return TestBasedEval(
        source="test_source",
        name="test_name",
        instructions="Judge if correct",
        inputs=["input1", "input2"],
        test_id="test-001",
    )


def _make_input_results(passed: list[bool]) -> list[InputEvalResult]:
    return [
        InputEvalResult(f"q{i}", f"a{i}", p, 1 if p else 0, "reason")
        for i, p in enumerate(passed)
    ]


def test_test_eval_result_passed_count():
    eval_spec = _make_test_eval()
    results = _make_input_results([True, False])
    r = TestEvalResult(eval_spec, results)
    assert r.passed_count == 1


def test_test_eval_result_pass_rate():
    eval_spec = _make_test_eval()
    results = _make_input_results([True, False])
    r = TestEvalResult(eval_spec, results)
    assert r.pass_rate == pytest.approx(0.5)


def test_test_eval_result_pass_rate_empty():
    eval_spec = _make_test_eval()
    r = TestEvalResult(eval_spec, [])
    assert r.pass_rate == 0.0


def test_test_eval_result_all_pass():
    eval_spec = _make_test_eval()
    results = _make_input_results([True, True])
    r = TestEvalResult(eval_spec, results)
    assert r.pass_rate == pytest.approx(1.0)


def test_test_eval_result_to_dict_structure():
    eval_spec = _make_test_eval()
    results = _make_input_results([True, False])
    r = TestEvalResult(eval_spec, results)
    d = r.to_dict()
    assert d["test_id"] == "test-001"
    assert d["source"] == "test_source"
    assert d["name"] == "test_name"
    assert d["instructions"] == "Judge if correct"
    assert len(d["input_results"]) == 2
    assert d["passed"] == 1
    assert d["total_count"] == 2
    assert d["pass_rate"] == pytest.approx(0.5)


# --- parse_judge_output ---


def test_parse_json_score_and_justification():
    output = '{"score": 1, "justification": "Correct answer"}'
    score, reason = parse_judge_output(output)
    assert score == 1
    assert reason == "Correct answer"


def test_parse_json_embedded_in_text():
    output = 'Based on my review: {"score": 0, "justification": "Wrong answer"} end.'
    score, reason = parse_judge_output(output)
    assert score == 0
    assert reason == "Wrong answer"


def test_parse_score_from_plain_text():
    output = "Score: 1\nThe answer is correct."
    score, reason = parse_judge_output(output)
    assert score == 1
    assert reason == output


def test_parse_no_score_returns_none():
    output = "I cannot determine the score."
    score, reason = parse_judge_output(output)
    assert score is None
    assert reason == output


def test_parse_invalid_json_falls_back_to_regex():
    output = 'Almost JSON: {"score": 1, but broken}'
    score, reason = parse_judge_output(output)
    # Regex fallback should find "score": 1 and return the full raw text as justification
    assert score == 1
    assert reason == output


def test_parse_zero_score():
    output = '{"score": 0, "justification": "Failed"}'
    score, reason = parse_judge_output(output)
    assert score == 0
    assert reason == "Failed"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
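
These tests describe the JSON-then-regex fallback named in the commit message: parse a JSON object when one can be extracted, otherwise scrape a score heuristically and return the raw text as the justification. A self-contained sketch consistent with all six cases above, assuming the (int | None, str) return shape the tests imply; the real parse_judge_output in cli.eval.runner may differ:

import json
import re


def parse_judge_output_sketch(output: str) -> tuple[int | None, str]:
    # First attempt: extract a JSON object and read score/justification from it.
    match = re.search(r"\{.*\}", output, re.DOTALL)
    if match:
        try:
            parsed = json.loads(match.group(0))
            return int(parsed["score"]), str(parsed["justification"])
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            pass  # malformed JSON: fall through to the regex fallback
    # Fallback: look for '"score": N' or 'Score: N'; the justification is the raw text.
    match = re.search(r'"?score"?\s*[:=]\s*(\d+)', output, re.IGNORECASE)
    if match:
        return int(match.group(1)), output
    return None, output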
