Skip to content

Commit 380b359

Browse files
authored
feat(eval): cyberseceval (#8)
* feat(eval): cyberseceval * feat: add configurable purplellama_path parameter for cyberseceval * docs: CyberSecEval SCG evaluation setup guide * docs: refine the guide for CyberSecEval SCG * fix: gemini comments * fix: ganler comments
1 parent 2586f84 commit 380b359

5 files changed

Lines changed: 102 additions & 4 deletions

File tree

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,28 @@ python eval/main.py --task "purpcode/PHTest" --model purpcode/purpco
174174
Notes:
175175
* `--oracle` for evaluating customized generation (default guessing from dataset).
176176

177+
<details><summary><b>CyberSecEval SCG Evaluation Setup</b> <i>:: click to expand ::</i></summary>
178+
<div>
179+
180+
```bash
181+
# Download and set up the PurpleLlama repository for CyberSecEval evaluation
182+
# Note: Run from purpcode directory, PurpleLlama will be cloned as a sibling directory
183+
git clone https://github.com/meta-llama/PurpleLlama.git ../PurpleLlama
184+
pip install -r ../PurpleLlama/CybersecurityBenchmarks/requirements.txt
185+
186+
# Run CyberSecEval SCG evaluation (default setup)
187+
python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl
188+
189+
# Alternative: if PurpleLlama is not cloned as a sibling of the purpcode directory, specify its location with the --purplellama_path parameter
190+
# Example (replace with your actual PurpleLlama installation path):
191+
python eval/main.py --task "purpcode/CyberSecEval-SCG" \
192+
--model purpcode/purpcode-14b-rl \
193+
--purplellama_path ../PurpleLlama
194+
```
195+
196+
</div>
197+
</details>
198+
177199
## Acknowledgements
178200

179201
- [Amazon Nova AI Challenge](https://www.amazon.science/nova-ai-challenge) for funding our research

eval/cyberseceval.py

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,75 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5-
# TODO(@zhewang2001): Please refactor the corresponding code snippets and then upload it.
5+
import json
6+
import os
7+
import subprocess
68

9+
from datasets import load_dataset
710

8-
def evaluate_cyberseceval(*args):
9-
pass # placeholder
11+
12+
def evaluate_cyberseceval(generation_path, task, purplellama_path=None):
    """Evaluate model generations with PurpleLlama's CyberSecEval benchmark.

    Reformats a JSONL generation file into the JSON schema expected by the
    ``CybersecurityBenchmarks`` runner, then invokes the runner on the
    precomputed responses.

    Args:
        generation_path: Path to a ``.jsonl`` file where each line is a JSON
            record containing ``task_id`` and chat-format ``messages``.
        task: Hugging Face dataset name whose ``test`` split provides the
            CyberSecEval task metadata keyed by ``task_id``.
        purplellama_path: Optional path to a PurpleLlama checkout. Defaults
            to a ``PurpleLlama`` directory that is a sibling of the current
            working directory.

    Raises:
        subprocess.CalledProcessError: If the benchmark runner exits non-zero.
    """
    # Resolve to an absolute path up front: we chdir below, which would break
    # any relative output path passed to the benchmark runner.
    output_path = os.path.abspath(generation_path.replace(".jsonl", ".reformat.json"))
    task_dataset = load_dataset(task)["test"]
    task_dict = {item["task_id"]: item for item in task_dataset}

    with open(generation_path, "r") as f:
        data = [json.loads(line) for line in f]

    # The model name is derived from the generation file name; it is
    # loop-invariant, so compute it once instead of per record.
    model = os.path.basename(generation_path).split(".")[0]

    results = []
    for idx, item in enumerate(data):
        task_id = item["task_id"]
        # Take the first assistant turn as the model response.
        assistant_content = None
        for msg in item["messages"]:
            if msg["role"] == "assistant":
                assistant_content = msg["content"]
                break

        # Skip generations whose task_id is not in the reference dataset.
        if task_id in task_dict:
            task_data = task_dict[task_id]
            results.append(
                {
                    "prompt_id": idx,
                    "pass_id": 0,
                    "test_case_prompt": task_data["test_case_prompt"],
                    "response": assistant_content,
                    "cwe_identifier": task_data["cwe_identifier"],
                    "language": task_data["language"],
                    "line_text": task_data["line_text"],
                    "origin_code": task_data["origin_code"],
                    "variant": task_data.get("variant", ""),
                    "rule": task_data.get("rule", ""),
                    "repo": task_data.get("repo", ""),
                    "model": model,
                }
            )

    with open(output_path, "w") as f:
        json.dump(results, f, indent=4)

    # The benchmark runner must execute from inside the PurpleLlama checkout
    # and needs that directory on PYTHONPATH. Both are process-global state,
    # so save them and restore in a `finally` block — otherwise a failing run
    # (subprocess.run with check=True, or a bad chdir) would leave the whole
    # process in the wrong directory with a polluted PYTHONPATH.
    original_pythonpath = os.environ.get("PYTHONPATH", "")
    current_dir = os.getcwd()
    try:
        if purplellama_path:
            os.chdir(purplellama_path)
        else:
            # Default layout: PurpleLlama cloned as a sibling directory.
            os.chdir(os.path.join("..", "PurpleLlama"))
        os.environ["PYTHONPATH"] = original_pythonpath + os.pathsep + os.getcwd()

        stat_path = output_path.replace(".reformat.json", ".stat.json")

        cmd = [
            "python3",
            "-m",
            "CybersecurityBenchmarks.benchmark.run",
            "--benchmark=instruct",
            "--use-precomputed-responses",
            f"--response-path={output_path}",
            f"--stat-path={stat_path}",
        ]
        subprocess.run(cmd, check=True)
    finally:
        os.chdir(current_dir)
        os.environ["PYTHONPATH"] = original_pythonpath

eval/eval_only.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,15 @@ def main(
1111
oracle: str = None,
1212
llm_judge: str = DEFAULT_LLM_JUDGE,
1313
reference_results_path: str = None,
14+
purplellama_path: str = None,
1415
):
1516
evaluate_main(
1617
task,
1718
generation_path,
1819
oracle=oracle,
1920
llm_judge=llm_judge,
2021
reference_results_path=reference_results_path,
22+
purplellama_path=purplellama_path,
2123
)
2224

2325

eval/evaluate.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def evaluate_main(
3838
oracle: str = None,
3939
llm_judge: str = None,
4040
reference_results_path: str = None,
41+
purplellama_path: str = None,
4142
):
4243
if oracle is None: # Guessing oracle
4344
print(f"Guessing oracle for task {task}...")
@@ -89,7 +90,11 @@ def evaluate_main(
8990
elif oracle == "cyberseceval":
9091
from eval.cyberseceval import evaluate_cyberseceval
9192

92-
evaluate_cyberseceval(generation_path=generation_path, task=task)
93+
evaluate_cyberseceval(
94+
generation_path=generation_path,
95+
task=task,
96+
purplellama_path=purplellama_path,
97+
)
9398
elif oracle == "codeguru":
9499
from eval.oracles.secure_code_oracles import evaluate_secure_code_gen
95100

eval/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def main(
1414
model_id: str = None,
1515
llm_judge: str = DEFAULT_LLM_JUDGE,
1616
reference_results_path: str = None,
17+
purplellama_path: str = None,
1718
tp: int = 1,
1819
transform_conversation: str = None,
1920
oracle: str = None,
@@ -41,6 +42,7 @@ def main(
4142
oracle=oracle,
4243
llm_judge=llm_judge,
4344
reference_results_path=reference_results_path,
45+
purplellama_path=purplellama_path,
4446
)
4547

4648

0 commit comments

Comments
 (0)