check results of driver

Dando18 · Dando18 · commit 9d3b365d6a4e · 2025-05-08T18:22:23.000-04:00
diff --git a/test/test-cpu.bash b/test/test-cpu.bash
@@ -10,7 +10,7 @@ else
 fi
 
 # First, use the baseline implementations to mimic LLM outputs.
-python prompts/create-serial-tests.py drivers/cpp/benchmarks prompts/generation-prompts.json serial-generations.json
+python3.11 prompts/create-serial-tests.py drivers/cpp/benchmarks prompts/generation-prompts.json serial-generations.json
 
 # make sure the model drivers are built
 cd drivers
@@ -19,7 +19,7 @@ make
 cd ..
 
 # Run the drivers using these generations
-python run-all.py \
+python3.11 run-all.py \
     ../serial-generations.json \
     --output results.json \
     --launch-configs launch-configs.json \
@@ -29,4 +29,16 @@ python run-all.py \
     ${PROBLEM_ARG} \
     --build-timeout 60 \
     --run-timeout 120 \
-    --log info
+    --log info
+
+
+# check results: compare the per-stage success counts in drivers/results.json with the expected values
+cd ..
+python3.11 test/validate-test-results.py \
+    --results drivers/results.json \
+    --problem "$1" \
+    --expected-write 3 \
+    --expected-source-valid 3 \
+    --expected-build 2 \
+    --expected-run 2 \
+    --expected-correct 1
diff --git a/test/validate-test-results.py b/test/validate-test-results.py
@@ -0,0 +1,119 @@
+""" Checks if the expected test results are present in the output JSON file.
+    usage: python test/validate-test-results.py \
+        --results <results.json> \
+        --problem <problem_name> \
+        --expected-write <expected_write_count> \
+        --expected-source-valid <expected_source_valid_count> \
+        --expected-build <expected_build_count> \
+        --expected-run <expected_run_count> \
+        --expected-correct <expected_correct_count>
+
+    Exits with status 0 when every count matches, and with status 1 when the
+    problem is missing from the results or any count differs.
+"""
+from argparse import ArgumentParser
+import json
+import sys
+from collections import Counter
+
+
+# Maps each boolean flag on an output record to the counter it feeds.
+STAGE_FLAGS = {
+    "source_write_success": "write",
+    "is_source_valid": "source_valid",
+    "did_build": "build",
+    "did_all_run": "run",
+    "are_all_valid": "correct",
+}
+
+
+def parse_args():
+    """Parse the command-line options; every option is required.
+
+    Returns the argparse namespace holding ``results``, ``problem`` and the
+    five integer ``expected_*`` counts.
+    """
+    parser = ArgumentParser(description="Validate test results.")
+    parser.add_argument(
+        "--results",
+        type=str,
+        required=True,
+        help="Path to the results JSON file.",
+    )
+    parser.add_argument(
+        "--problem",
+        type=str,
+        required=True,
+        help="Name of the problem to validate.",
+    )
+    # The five count options differ only in the stage name.
+    for stage in ("write", "source-valid", "build", "run", "correct"):
+        parser.add_argument(
+            f"--expected-{stage}",
+            type=int,
+            required=True,
+            help=f"Expected number of {stage.replace('-', ' ')} operations.",
+        )
+
+    return parser.parse_args()
+
+
+def validate_outputs(outputs, expected_counts):
+    """Compare per-stage success counts in ``outputs`` with ``expected_counts``.
+
+    Args:
+        outputs: iterable of dict-like records; a stage counts as successful
+            for a record when its flag is present and truthy.
+        expected_counts: mapping of stage name ("write", "source_valid",
+            "build", "run", "correct") to the expected number of successes.
+
+    Returns:
+        True when every expected count matches; otherwise prints one line per
+        mismatching stage and returns False.
+    """
+    actual_counts = Counter()
+    for output in outputs:
+        actual_counts.update(
+            stage for flag, stage in STAGE_FLAGS.items() if output.get(flag, False)
+        )
+
+    all_match = True
+    for key, expected in expected_counts.items():
+        actual = actual_counts[key]  # Counter yields 0 for stages never seen
+        if actual != expected:
+            print(f"Expected {expected} for {key}, but got {actual}.")
+            all_match = False  # keep going so every mismatch is reported
+    return all_match
+
+
+def main():
+    """Validate one problem's results; return 0 on success and 1 on failure."""
+    args = parse_args()
+
+    # Load the results JSON file
+    with open(args.results, "r", encoding="utf-8") as f:
+        results = json.load(f)
+
+    expected_counts = {
+        "write": args.expected_write,
+        "source_valid": args.expected_source_valid,
+        "build": args.expected_build,
+        "run": args.expected_run,
+        "correct": args.expected_correct,
+    }
+
+    # Only the first entry whose name matches is checked.
+    problem_results = next((r for r in results if r["name"] == args.problem), None)
+    if problem_results is None:
+        print(f"Problem {args.problem} not found in {args.results}.")
+        return 1
+
+    if not validate_outputs(problem_results["outputs"], expected_counts):
+        print(f"Validation failed for problem {args.problem}.")
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    # Hand main()'s status to the shell; a bare main() call would always exit 0.
+    sys.exit(main())