
Commit 5da4634

beachdwellerclaude committed

feat: concise success feedback and token usage tracking

Shorten the all-tests-passed feedback to 3-5 sentences (#2) and write token_usage.json to OUTPUT-DIR for post-run analysis (#5). Token usage extraction is multi-provider: Gemini, Claude, and OpenAI-compatible response formats. 11 new tests cover both features.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

1 parent 68a18d9 · commit 5da4634
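
The token_usage.json artifact is meant for post-run analysis; as a minimal sketch of reading it back (the output path and field values here are illustrative, not from a real run):

    # Read the token-usage artifact after a run; path and values are illustrative.
    import json
    import pathlib

    usage = json.loads(pathlib.Path("output/token_usage.json").read_text())
    print(usage["model"], usage["input_tokens"], usage["output_tokens"], usage["total_tokens"])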

5 files changed

Lines changed: 223 additions & 2 deletions


entrypoint.py

Lines changed: 65 additions & 1 deletion

@@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 # begin entrypoint.py

+import json
 import logging
 import os
 import pathlib
 import sys

-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Optional, Tuple


 sys.path.insert(
@@ -79,6 +80,69 @@ def main(b_ask:bool=True) -> None:
     elif b_fail_expected:
         assert n_failed > 0, 'No failed tests detected when failure was expected'

+    # Write token usage to artifact directory if available
+    output_dir = os.getenv('INPUT_OUTPUT-DIR', '')
+    if output_dir and b_ask:
+        write_token_usage(client, model, pathlib.Path(output_dir))
+
+
+def extract_token_usage(raw_response: Optional[dict]) -> Dict[str, Any]:
+    """Extract token usage from LLM API response (best-effort, multi-provider).
+
+    Different providers return usage in different structures:
+        Gemini:      usageMetadata.promptTokenCount / candidatesTokenCount
+        Claude:      usage.input_tokens / output_tokens
+        OpenAI-like: usage.prompt_tokens / completion_tokens (Grok, NVIDIA, Perplexity)
+
+    Returns dict with input_tokens, output_tokens, total_tokens (None if unavailable).
+    """
+    if not raw_response or not isinstance(raw_response, dict):
+        return {"input_tokens": None, "output_tokens": None, "total_tokens": None}
+
+    # Gemini format
+    usage = raw_response.get("usageMetadata", {})
+    if usage:
+        return {
+            "input_tokens": usage.get("promptTokenCount"),
+            "output_tokens": usage.get("candidatesTokenCount"),
+            "total_tokens": usage.get("totalTokenCount"),
+        }
+
+    # Claude / OpenAI-compatible format
+    usage = raw_response.get("usage", {})
+    if usage:
+        input_t = usage.get("input_tokens") or usage.get("prompt_tokens")
+        output_t = usage.get("output_tokens") or usage.get("completion_tokens")
+        total_t = usage.get("total_tokens")
+        if total_t is None and input_t is not None and output_t is not None:
+            total_t = input_t + output_t
+        return {
+            "input_tokens": input_t,
+            "output_tokens": output_t,
+            "total_tokens": total_t,
+        }
+
+    return {"input_tokens": None, "output_tokens": None, "total_tokens": None}
+
+
+def write_token_usage(
+    client: 'LLMAPIClient',
+    model: str,
+    output_dir: pathlib.Path,
+) -> None:
+    """Write token_usage.json to output directory."""
+    usage = extract_token_usage(client.last_raw_response)
+    usage["model"] = model
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    usage_path = output_dir / "token_usage.json"
+    try:
+        with open(usage_path, "w", encoding="utf-8") as f:
+            json.dump(usage, f, indent=2)
+        logging.info(f"Token usage written to {usage_path}: {usage}")
+    except OSError as e:
+        logging.warning(f"Could not write token usage: {e}")
+

 def get_startwith(key:str, dictionary:dict) -> Any:
     result = None
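
To see the provider fallback in action, extract_token_usage can be exercised standalone; a minimal sketch, assuming entrypoint.py is importable (as the test suite below does) and using illustrative payloads:

    # Illustrative payloads exercising the multi-provider fallback.
    import entrypoint

    gemini = {"usageMetadata": {"promptTokenCount": 10,
                                "candidatesTokenCount": 20,
                                "totalTokenCount": 30}}
    openai_like = {"usage": {"prompt_tokens": 8, "completion_tokens": 4}}  # no total: computed as 8 + 4

    print(entrypoint.extract_token_usage(gemini))       # {'input_tokens': 10, 'output_tokens': 20, 'total_tokens': 30}
    print(entrypoint.extract_token_usage(openai_like))  # {'input_tokens': 8, 'output_tokens': 4, 'total_tokens': 12}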

llm_client.py

Lines changed: 2 additions & 0 deletions

@@ -50,6 +50,7 @@ def __init__(self, config: LLMConfig, retry_delay_sec: float = 5.0,
         self.max_retry_attempt = max_retry_attempt
         self.timeout_sec = timeout_sec
         self.logger = logging.getLogger(__name__)  # Logger for this module
+        self.last_raw_response = None  # Store last API response for token usage extraction

     def call_api(self, question: str) -> Optional[str]:
         """Send a question to the LLM API with retry and timeout handling.
@@ -99,6 +100,7 @@ def call_api(self, question: str) -> Optional[str]:
            try:
                # Parse JSON and extract response using config-specific method
                result = response.json()
+               self.last_raw_response = result
                return self.config.parse_response(result)
            except (ValueError, KeyError) as e:
                # Log parsing errors (invalid JSON or unexpected structure)
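
This attribute is the handoff point that entrypoint.write_token_usage reads after the run; a sketch with a stub standing in for a real LLMAPIClient (the stub, model name, and output directory are all illustrative):

    # Stub client standing in for LLMAPIClient; names here are illustrative.
    import pathlib
    import entrypoint

    class StubClient:
        last_raw_response = {"usage": {"input_tokens": 5, "output_tokens": 7}}

    entrypoint.write_token_usage(StubClient(), "example-model", pathlib.Path("out"))
    # out/token_usage.json now holds input/output/total token counts plus the model name.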

prompt.py

Lines changed: 5 additions & 1 deletion

@@ -93,7 +93,11 @@ def get_initial_instruction(questions: List[str], language: str) -> str:
     )
     return (
         f"{guardrail}\n"
-        f"In {language}, please comment on the student code given the assignment instruction."
+        f"All tests passed. In {language}, in 3-5 sentences:\n"
+        "1. Briefly note what the student did well.\n"
+        "2. Suggest one specific improvement if applicable "
+        "(e.g., efficiency, readability, edge cases).\n"
+        "Do not repeat test results. Do not assign or fabricate scores."
     )

     prompt_list = (

tests/test_entrypoint.py

Lines changed: 94 additions & 0 deletions

@@ -166,6 +166,100 @@ def test_get_model_key_from_env_with_api_key(monkeypatch):
     assert api_key == "test-api-key"


+class TestExtractTokenUsage:
+    """Tests for extract_token_usage multi-provider support."""
+
+    def test_gemini_format(self):
+        raw = {
+            "usageMetadata": {
+                "promptTokenCount": 150,
+                "candidatesTokenCount": 200,
+                "totalTokenCount": 350,
+            }
+        }
+        result = entrypoint.extract_token_usage(raw)
+        assert result["input_tokens"] == 150
+        assert result["output_tokens"] == 200
+        assert result["total_tokens"] == 350
+
+    def test_claude_format(self):
+        raw = {
+            "usage": {
+                "input_tokens": 100,
+                "output_tokens": 250,
+            }
+        }
+        result = entrypoint.extract_token_usage(raw)
+        assert result["input_tokens"] == 100
+        assert result["output_tokens"] == 250
+        assert result["total_tokens"] == 350  # computed
+
+    def test_openai_format(self):
+        raw = {
+            "usage": {
+                "prompt_tokens": 80,
+                "completion_tokens": 120,
+                "total_tokens": 200,
+            }
+        }
+        result = entrypoint.extract_token_usage(raw)
+        assert result["input_tokens"] == 80
+        assert result["output_tokens"] == 120
+        assert result["total_tokens"] == 200
+
+    def test_none_response(self):
+        result = entrypoint.extract_token_usage(None)
+        assert result["input_tokens"] is None
+        assert result["output_tokens"] is None
+
+    def test_empty_dict(self):
+        result = entrypoint.extract_token_usage({})
+        assert result["input_tokens"] is None
+
+    def test_missing_usage(self):
+        raw = {"id": "123", "choices": []}
+        result = entrypoint.extract_token_usage(raw)
+        assert result["input_tokens"] is None
+
+
+class TestWriteTokenUsage:
+    """Tests for write_token_usage file output."""
+
+    def test_writes_json(self, tmp_path):
+        class MockClient:
+            last_raw_response = {
+                "usageMetadata": {
+                    "promptTokenCount": 50,
+                    "candidatesTokenCount": 100,
+                    "totalTokenCount": 150,
+                }
+            }
+        entrypoint.write_token_usage(MockClient(), "gemini-2.5-flash", tmp_path)
+        usage_file = tmp_path / "token_usage.json"
+        assert usage_file.exists()
+        import json
+        data = json.loads(usage_file.read_text())
+        assert data["model"] == "gemini-2.5-flash"
+        assert data["input_tokens"] == 50
+        assert data["output_tokens"] == 100
+
+    def test_creates_directory(self, tmp_path):
+        nested = tmp_path / "sub" / "dir"
+        class MockClient:
+            last_raw_response = {}
+        entrypoint.write_token_usage(MockClient(), "claude", nested)
+        assert (nested / "token_usage.json").exists()
+
+    def test_handles_none_response(self, tmp_path):
+        class MockClient:
+            last_raw_response = None
+        entrypoint.write_token_usage(MockClient(), "grok", tmp_path)
+        import json
+        data = json.loads((tmp_path / "token_usage.json").read_text())
+        assert data["model"] == "grok"
+        assert data["input_tokens"] is None
+
+
 if __name__ == '__main__':
     pytest.main([__file__])

tests/test_prompt.py

Lines changed: 57 additions & 0 deletions

@@ -524,6 +524,63 @@ def test_collect_longrepr__compare_contents(collect_longrepr_result: List[str]):
     assert not missing_markers, f"Missing markers: {missing_markers}"


+@pytest.fixture
+def all_passing_report(tmp_path) -> pathlib.Path:
+    """Create a pytest JSON report where all tests pass."""
+    report = {
+        "tests": [
+            {"nodeid": "test_syntax::test_valid", "outcome": "passed",
+             "call": {"longrepr": None}},
+            {"nodeid": "test_results::test_calc_area", "outcome": "passed",
+             "call": {"longrepr": None}},
+        ]
+    }
+    path = tmp_path / "all_pass_report.json"
+    path.write_text(json.dumps(report))
+    return path
+
+
+def test_get_prompt__all_passing__concise_instruction(
+        all_passing_report: pathlib.Path,
+        sample_student_code_path: pathlib.Path,
+        sample_readme_path: pathlib.Path,
+):
+    """When all tests pass, the prompt should instruct concise feedback."""
+    n_failed, prompt_text = prompt.get_prompt(
+        report_paths=(all_passing_report,),
+        student_files=(sample_student_code_path,),
+        readme_file=sample_readme_path,
+        explanation_in="Korean",
+    )
+
+    assert n_failed == 0
+    # Should contain concise instructions
+    assert "3-5 sentences" in prompt_text
+    assert "Do not assign or fabricate scores" in prompt_text
+    # Should NOT contain the verbose "comment on the student code" instruction
+    assert "please comment on the student code" not in prompt_text
+
+
+def test_get_prompt__with_failures__has_directive(
+        sample_report_path: pathlib.Path,
+        sample_student_code_path: pathlib.Path,
+        sample_readme_path: pathlib.Path,
+):
+    """When tests fail, the prompt should contain the error directive."""
+    n_failed, prompt_text = prompt.get_prompt(
+        report_paths=(sample_report_path,),
+        student_files=(sample_student_code_path,),
+        readme_file=sample_readme_path,
+        explanation_in="Korean",
+    )
+
+    assert n_failed > 0
+    # Should contain failure-specific instruction
+    assert "mutually exclusively and collectively exhaustively" in prompt_text
+    # Should NOT contain the concise success instruction
+    assert "3-5 sentences" not in prompt_text
+
+
 if __name__ == '__main__':
     pytest.main([__file__])
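
Both new test groups rely only on stock pytest (tmp_path plus the fixtures already defined in these files); a quick way to run just the touched test files, assuming pytest is installed and the command is run from the repository root:

    pytest tests/test_entrypoint.py tests/test_prompt.py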
