
Commit 642ec7c

test: unit tests for CLI decompose and eval pure-logic helpers (#861) (#863)
* test: unit tests for CLI decompose and eval pure-logic helpers (#861)

  35 tests covering reorder_subtasks (topological sort + cycle detection), verify_user_variables (input var and dependency cross-validation), validate_filename (path-traversal-safe regex), and parse_judge_output (JSON-then-regex fallback scoring parser).

* test: strengthen CLI unit tests per code review (#861)

  - test_parse_invalid_json_falls_back_to_regex: assert reason == output to verify the regex fallback returns the raw text as justification, not just the score
  - _make_test_eval: pass an explicit test_id so the to_dict assertion is non-trivial
  - test_reorder_invalid_dependency_ignored: also assert the tag survives intact
  - test_reorder_case_insensitive_dependency: new test verifying tag lookups are case-normalised (depends_on=["A"] resolves tag "a")
  - test_valid_filename_max_length: boundary test for the 250-char upper limit

* test: add return type annotations to CLI test helper functions (#861)

  Aligns with codebase convention (test_openai_compatible_helpers.py, test_granite32_input.py, and test_unit_test_eval.py all annotate helpers).
1 parent 6825cb1 commit 642ec7c
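
The commit message names four pure-logic helpers. As a reading aid, here is a minimal sketch of the input-variable and dependency cross-validation that verify_user_variables performs, inferred only from the message and the tests below; the shipped helper in cli.decompose.decompose may differ, and the error strings here are assumptions pinned down by the tests' match= patterns.

from cli.decompose.decompose import reorder_subtasks


def verify_user_variables_sketch(data: dict, input_var: list[str] | None) -> dict:
    provided = set(input_var or [])  # input_var=None is treated as "no variables supplied"
    known_tags = {s["tag"].lower() for s in data["subtasks"]}
    for subtask in data["subtasks"]:
        # Every declared input variable must be supplied by the caller.
        for var in subtask["input_vars_required"]:
            if var not in provided:
                raise ValueError(
                    f"Subtask '{subtask['tag']}' requires input variable '{var}'"
                )
        # Every dependency must name an existing subtask tag (case-insensitively).
        for dep in subtask["depends_on"]:
            if dep.lower() not in known_tags:
                raise ValueError(f"Dependency '{dep}' does not exist")
    # Reorder so dependents always follow their dependencies, then return the same dict.
    data["subtasks"] = reorder_subtasks(data["subtasks"])
    return data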

2 files changed

Lines changed: 345 additions & 0 deletions

File tree

test/cli/test_decompose_unit.py
test/cli/test_eval_unit.py

test/cli/test_decompose_unit.py

Lines changed: 199 additions & 0 deletions
@@ -0,0 +1,199 @@
"""Unit tests for decompose pure-logic helpers — no backend, no file I/O required.

Covers reorder_subtasks, verify_user_variables, validate_filename.
"""

import pytest

from cli.decompose.decompose import reorder_subtasks, verify_user_variables
from cli.decompose.utils import validate_filename

# --- reorder_subtasks ---


def _subtask(tag: str, subtask: str, depends_on: list[str] | None = None) -> dict:
    """Minimal subtask dict for testing."""
    d = {
        "tag": tag,
        "subtask": subtask,
        "constraints": [],
        "prompt_template": "",
        "general_instructions": "",
        "input_vars_required": [],
        "depends_on": depends_on or [],
    }
    return d


def test_reorder_no_dependencies():
    subtasks = [_subtask("a", "1. Task A"), _subtask("b", "2. Task B")]
    result = reorder_subtasks(subtasks)
    # No dependencies — all tasks present, order unconstrained
    assert {s["tag"] for s in result} == {"a", "b"}


def test_reorder_respects_dependency():
    subtasks = [
        _subtask("b", "1. Task B", depends_on=["a"]),
        _subtask("a", "2. Task A"),
    ]
    result = reorder_subtasks(subtasks)
    tags = [s["tag"] for s in result]
    assert tags.index("a") < tags.index("b")


def test_reorder_chain_dependency():
    subtasks = [
        _subtask("c", "1. C", depends_on=["b"]),
        _subtask("b", "2. B", depends_on=["a"]),
        _subtask("a", "3. A"),
    ]
    result = reorder_subtasks(subtasks)
    tags = [s["tag"] for s in result]
    assert tags == ["a", "b", "c"]


def test_reorder_circular_raises():
    subtasks = [
        _subtask("a", "1. A", depends_on=["b"]),
        _subtask("b", "2. B", depends_on=["a"]),
    ]
    with pytest.raises(ValueError, match="Circular dependency"):
        reorder_subtasks(subtasks)


def test_reorder_renumbers_subtasks():
    subtasks = [
        _subtask("b", "2. Task B", depends_on=["a"]),
        _subtask("a", "1. Task A"),
    ]
    result = reorder_subtasks(subtasks)
    # After reordering, numbering should be updated
    assert result[0]["subtask"].startswith("1. ")
    assert result[1]["subtask"].startswith("2. ")


def test_reorder_invalid_dependency_ignored():
    subtasks = [_subtask("a", "1. A", depends_on=["nonexistent"])]
    result = reorder_subtasks(subtasks)
    assert len(result) == 1
    assert result[0]["tag"] == "a"


def test_reorder_case_insensitive_dependency():
    # Tags and depends_on are lowercased before lookup — mixed case must resolve correctly
    subtasks = [
        _subtask("b", "1. Task B", depends_on=["A"]),
        _subtask("a", "2. Task A"),
    ]
    result = reorder_subtasks(subtasks)
    tags = [s["tag"] for s in result]
    assert tags.index("a") < tags.index("b")


# --- verify_user_variables ---


def _decomp_data(subtasks: list[dict]) -> dict:
    return {
        "original_task_prompt": "",
        "subtask_list": [],
        "identified_constraints": [],
        "subtasks": subtasks,
    }


def test_verify_valid_input_vars():
    data = _decomp_data([_subtask("a", "A", depends_on=[])])
    data["subtasks"][0]["input_vars_required"] = ["doc"]
    result = verify_user_variables(data, input_var=["doc"])
    assert result is data


def test_verify_missing_input_var_raises():
    data = _decomp_data([_subtask("a", "A")])
    data["subtasks"][0]["input_vars_required"] = ["doc"]
    with pytest.raises(ValueError, match="requires input variable"):
        verify_user_variables(data, input_var=[])


def test_verify_missing_dependency_raises():
    data = _decomp_data([_subtask("a", "A", depends_on=["nonexistent"])])
    with pytest.raises(ValueError, match="does not exist"):
        verify_user_variables(data, input_var=[])


def test_verify_reorders_when_needed():
    data = _decomp_data(
        [_subtask("b", "1. B", depends_on=["a"]), _subtask("a", "2. A")]
    )
    result = verify_user_variables(data, input_var=None)
    tags = [s["tag"] for s in result["subtasks"]]
    assert tags.index("a") < tags.index("b")


def test_verify_no_reorder_when_already_sorted():
    data = _decomp_data(
        [_subtask("a", "1. A"), _subtask("b", "2. B", depends_on=["a"])]
    )
    result = verify_user_variables(data, input_var=None)
    tags = [s["tag"] for s in result["subtasks"]]
    assert tags == ["a", "b"]


def test_verify_none_input_var_treated_as_empty():
    data = _decomp_data([_subtask("a", "A")])
    result = verify_user_variables(data, input_var=None)
    assert result is data


# --- validate_filename ---


def test_valid_filename():
    assert validate_filename("my_output_file") is True


def test_valid_filename_with_extension():
    assert validate_filename("results.json") is True


def test_valid_filename_with_hyphen():
    assert validate_filename("my-output") is True


def test_valid_filename_with_spaces():
    assert validate_filename("my output file") is True


def test_invalid_filename_slash():
    assert validate_filename("path/to/file") is False


def test_invalid_filename_empty():
    assert validate_filename("") is False


def test_invalid_filename_single_char():
    # Pattern requires at least 2 chars (first char + rest)
    assert validate_filename("a") is False


def test_invalid_filename_starts_with_hyphen():
    assert validate_filename("-badname") is False


def test_valid_filename_starts_with_dot():
    assert validate_filename(".hidden_file") is True


def test_invalid_filename_too_long():
    assert validate_filename("a" * 251) is False


def test_valid_filename_max_length():
    assert validate_filename("a" * 250) is True


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
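
The reorder tests above pin down the helper's observable behaviour: case-normalised tag lookup, unknown dependencies silently ignored, ValueError on cycles, and renumbering of the "N. " prefix. A minimal sketch that would satisfy them, built on the stdlib graphlib (an assumption; the shipped implementation in cli.decompose.decompose is not shown in this diff and may be hand-rolled):

import re
from graphlib import CycleError, TopologicalSorter  # stdlib since Python 3.9


def reorder_subtasks_sketch(subtasks: list[dict]) -> list[dict]:
    by_tag = {s["tag"].lower(): s for s in subtasks}
    # Map each tag to its known predecessors; depends_on entries matching no
    # subtask are dropped (see test_reorder_invalid_dependency_ignored).
    graph = {
        tag: {d.lower() for d in s["depends_on"] if d.lower() in by_tag}
        for tag, s in by_tag.items()
    }
    try:
        order = list(TopologicalSorter(graph).static_order())
    except CycleError as exc:
        raise ValueError(f"Circular dependency detected: {exc.args[1]}") from exc
    result = [by_tag[tag] for tag in order]
    # Renumber the leading "N. " prefix to reflect the new execution order.
    for i, s in enumerate(result, start=1):
        s["subtask"] = f"{i}. " + re.sub(r"^\d+\.\s*", "", s["subtask"])
    return result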

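The filename tests constrain the validation pattern tightly: 2 to 250 characters, a leading word character or dot (so hidden files pass but a leading hyphen fails), and no path separators. One regex among several possible that satisfies every case above; this is a reconstruction, not the actual pattern in cli.decompose.utils:

import re

# Hypothetical reconstruction: first char is a word character or dot,
# followed by 1-249 word characters, dots, spaces, or hyphens.
_FILENAME_RE = re.compile(r"[\w.][\w. -]{1,249}")


def validate_filename_sketch(name: str) -> bool:
    return _FILENAME_RE.fullmatch(name) is not None
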
test/cli/test_eval_unit.py

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
"""Unit tests for eval runner pure-logic helpers — no backend, no model required.

Covers InputEvalResult, TestEvalResult, parse_judge_output.
"""

import pytest

from cli.eval.runner import InputEvalResult, TestEvalResult, parse_judge_output
from mellea.stdlib.components.unit_test_eval import TestBasedEval

# --- InputEvalResult ---


def test_input_eval_result_to_dict():
    r = InputEvalResult(
        input_text="What is 2+2?",
        model_output="4",
        validation_passed=True,
        score=1,
        validation_reason="Correct answer",
    )
    d = r.to_dict()
    assert d["input"] == "What is 2+2?"
    assert d["model_output"] == "4"
    assert d["passed"] is True
    assert d["score"] == 1
    assert d["justification"] == "Correct answer"


def test_input_eval_result_to_dict_failed():
    r = InputEvalResult("q", "wrong", False, 0, "Incorrect")
    d = r.to_dict()
    assert d["passed"] is False
    assert d["score"] == 0


# --- TestEvalResult ---


def _make_test_eval() -> TestBasedEval:
    return TestBasedEval(
        source="test_source",
        name="test_name",
        instructions="Judge if correct",
        inputs=["input1", "input2"],
        test_id="test-001",
    )


def _make_input_results(passed: list[bool]) -> list[InputEvalResult]:
    return [
        InputEvalResult(f"q{i}", f"a{i}", p, 1 if p else 0, "reason")
        for i, p in enumerate(passed)
    ]


def test_test_eval_result_passed_count():
    eval_spec = _make_test_eval()
    results = _make_input_results([True, False])
    r = TestEvalResult(eval_spec, results)
    assert r.passed_count == 1


def test_test_eval_result_pass_rate():
    eval_spec = _make_test_eval()
    results = _make_input_results([True, False])
    r = TestEvalResult(eval_spec, results)
    assert r.pass_rate == pytest.approx(0.5)


def test_test_eval_result_pass_rate_empty():
    eval_spec = _make_test_eval()
    r = TestEvalResult(eval_spec, [])
    assert r.pass_rate == 0.0


def test_test_eval_result_all_pass():
    eval_spec = _make_test_eval()
    results = _make_input_results([True, True])
    r = TestEvalResult(eval_spec, results)
    assert r.pass_rate == pytest.approx(1.0)


def test_test_eval_result_to_dict_structure():
    eval_spec = _make_test_eval()
    results = _make_input_results([True, False])
    r = TestEvalResult(eval_spec, results)
    d = r.to_dict()
    assert d["test_id"] == "test-001"
    assert d["source"] == "test_source"
    assert d["name"] == "test_name"
    assert d["instructions"] == "Judge if correct"
    assert len(d["input_results"]) == 2
    assert d["passed"] == 1
    assert d["total_count"] == 2
    assert d["pass_rate"] == pytest.approx(0.5)


# --- parse_judge_output ---


def test_parse_json_score_and_justification():
    output = '{"score": 1, "justification": "Correct answer"}'
    score, reason = parse_judge_output(output)
    assert score == 1
    assert reason == "Correct answer"


def test_parse_json_embedded_in_text():
    output = 'Based on my review: {"score": 0, "justification": "Wrong answer"} end.'
    score, reason = parse_judge_output(output)
    assert score == 0
    assert reason == "Wrong answer"


def test_parse_score_from_plain_text():
    output = "Score: 1\nThe answer is correct."
    score, reason = parse_judge_output(output)
    assert score == 1
    assert reason == output


def test_parse_no_score_returns_none():
    output = "I cannot determine the score."
    score, reason = parse_judge_output(output)
    assert score is None
    assert reason == output


def test_parse_invalid_json_falls_back_to_regex():
    output = 'Almost JSON: {"score": 1, but broken}'
    score, reason = parse_judge_output(output)
    # Regex fallback should find "score": 1 and return the full raw text as justification
    assert score == 1
    assert reason == output


def test_parse_zero_score():
    output = '{"score": 0, "justification": "Failed"}'
    score, reason = parse_judge_output(output)
    assert score == 0
    assert reason == "Failed"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])
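
These tests describe the JSON-then-regex fallback named in the commit message: parse a JSON object when one can be extracted, otherwise scrape a score heuristically and return the raw text as the justification. A self-contained sketch consistent with all six cases above, assuming the (int | None, str) return shape the tests imply; the real parse_judge_output in cli.eval.runner may differ:

import json
import re


def parse_judge_output_sketch(output: str) -> tuple[int | None, str]:
    # First attempt: extract a JSON object and read score/justification from it.
    match = re.search(r"\{.*\}", output, re.DOTALL)
    if match:
        try:
            parsed = json.loads(match.group(0))
            return int(parsed["score"]), str(parsed["justification"])
        except (json.JSONDecodeError, KeyError, TypeError, ValueError):
            pass  # malformed JSON: fall through to the regex fallback
    # Fallback: look for '"score": N' or 'Score: N'; the justification is the raw text.
    match = re.search(r'"?score"?\s*[:=]\s*(\d+)', output, re.IGNORECASE)
    if match:
        return int(match.group(1)), output
    return None, output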
