
Commit 2fc2447

feat(eval): cweval (#9)
* docs: CWEval evaluation setup guide
* feat(eval): cweval
* refactor: simplify code block extraction logic
* fix: gemini comments
1 parent 380b359 commit 2fc2447

5 files changed: 88 additions & 4 deletions

README.md

Lines changed: 25 additions & 0 deletions
@@ -196,6 +196,31 @@ python eval/main.py --task "purpcode/CyberSecEval-SCG" \
 </div>
 </details>
 
+<details><summary><b>CWEval Evaluation Setup</b> <i>:: click to expand ::</i></summary>
+<div>
+
+```bash
+# Download and set up the CWEval repository for CWEval evaluation
+# Note: run from the purpcode directory; CWEval will be cloned as a sibling directory
+git clone https://github.com/Co1lin/CWEval.git ../CWEval
+
+# Run the CWEval evaluation (default setup)
+python eval/main.py --task "purpcode/CWEval" --model purpcode/purpcode-14b-rl
+
+# Alternative: if CWEval is not a sibling of the purpcode directory, pass its location via --cweval_path
+# Example (replace with your actual CWEval installation path):
+python eval/main.py --task "purpcode/CWEval" \
+    --model purpcode/purpcode-14b-rl \
+    --cweval_path ../CWEval
+
+# Note: generated files are saved into the CWEval repository
+# purpcode only handles response generation; evaluation must be performed in the CWEval repository
+# Follow the CWEval README (https://github.com/Co1lin/CWEval/blob/main/README.md) for the remaining steps
+```
+
+</div>
+</details>
+
 ## Acknowledgements
 
 - [Amazon Nova AI Challenge](https://www.amazon.science/nova-ai-challenge) for funding our research
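The sibling-directory default documented above mirrors the fallback in `eval/cweval.py` below: when `--cweval_path` is omitted, the evaluator changes into a directory named `CWEval` next to the purpcode checkout. A minimal sketch of that resolution (`resolve_cweval_dir` is an illustrative helper, not part of the repository):

```python
import os

def resolve_cweval_dir(cweval_path=None):
    # Illustrative helper: an explicit path wins; otherwise assume CWEval
    # sits next to the current (purpcode) checkout, as cweval.py does via
    # os.chdir("..") followed by os.chdir("CWEval").
    if cweval_path:
        return os.path.abspath(cweval_path)
    return os.path.abspath(os.path.join("..", "CWEval"))

print(resolve_cweval_dir())               # e.g. /home/user/CWEval
print(resolve_cweval_dir("/opt/CWEval"))  # /opt/CWEval
```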

eval/cweval.py

Lines changed: 55 additions & 3 deletions
@@ -2,8 +2,60 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-# TODO(@zhewang2001): Please refactor the corresponding code snippets and then upload it.
+import json
+import os
 
+from datasets import load_dataset
 
-def evaluate_cweval(*args):
-    pass  # placeholder
+
+def evaluate_cweval(generation_path, task, cweval_path=None):
+    model = generation_path.split("/")[-1].split(".trimmed")[0]
+
+    current_dir = os.getcwd()
+    generation_path = os.path.abspath(generation_path)
+
+    if cweval_path:
+        os.chdir(cweval_path)
+    else:
+        os.chdir("..")
+        os.chdir("CWEval")
+    base_output_dir = os.path.join(os.getcwd(), "evals", model, "generated_0")
+
+    task_dataset = load_dataset(task)["test"]
+    task_dict = {
+        item["task_id"]: item["file_path"].replace("_task", "_raw")
+        for item in task_dataset
+    }
+
+    os.makedirs(base_output_dir, exist_ok=True)
+
+    with open(generation_path, "r") as f:
+        data = [json.loads(line) for line in f]
+
+    for item in data:
+        task_id = item["task_id"]
+        file_path = task_dict.get(task_id)
+
+        if file_path and "messages" in item:
+            assistant_content = None
+            for message in item["messages"]:
+                if message["role"] == "assistant":
+                    assistant_content = message["content"]
+                    break
+
+            if assistant_content:
+                code_blocks = assistant_content.split("```")
+                if len(code_blocks) >= 3:
+                    code_block_with_lang = code_blocks[1]
+                    if "\n" in code_block_with_lang:
+                        code_block = code_block_with_lang.split("\n", 1)[1].strip()
+                    else:
+                        code_block = code_block_with_lang.strip()
+
+                    output_path = os.path.join(base_output_dir, file_path)
+                    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+                    with open(output_path, "w") as f:
+                        f.write(code_block)
+
+    os.chdir(current_dir)
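The simplified extraction step above takes the first fenced block from the assistant message: splitting on ``` leaves the block body at index 1, and the first line (the optional language tag) is dropped. A standalone sketch of that behavior (`extract_first_code_block` is an illustrative name, not a function in the repository):

```python
def extract_first_code_block(assistant_content: str) -> str | None:
    # Mirrors the logic in evaluate_cweval: return the text between the
    # first pair of ``` fences, minus the language tag on its first line.
    code_blocks = assistant_content.split("```")
    if len(code_blocks) < 3:  # no complete fenced block in the response
        return None
    block = code_blocks[1]
    return block.split("\n", 1)[1].strip() if "\n" in block else block.strip()

reply = "Here is a fix:\n```c\nint main(void) { return 0; }\n```\nDone."
print(extract_first_code_block(reply))  # -> int main(void) { return 0; }
```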

eval/eval_only.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@ def main(
     llm_judge: str = DEFAULT_LLM_JUDGE,
     reference_results_path: str = None,
     purplellama_path: str = None,
+    cweval_path: str = None,
 ):
     evaluate_main(
         task,
@@ -20,6 +21,7 @@ def main(
         llm_judge=llm_judge,
         reference_results_path=reference_results_path,
         purplellama_path=purplellama_path,
+        cweval_path=cweval_path,
     )
 
 
eval/evaluate.py

Lines changed: 4 additions & 1 deletion
@@ -39,6 +39,7 @@ def evaluate_main(
     llm_judge: str = None,
     reference_results_path: str = None,
     purplellama_path: str = None,
+    cweval_path: str = None,
 ):
     if oracle is None:  # Guessing oracle
         print(f"Guessing oracle for task {task}...")
@@ -143,7 +144,9 @@ def evaluate_main(
     elif oracle == "cweval":
         from eval.cweval import evaluate_cweval
 
-        evaluate_cweval(generation_path=generation_path, task=task)
+        evaluate_cweval(
+            generation_path=generation_path, task=task, cweval_path=cweval_path
+        )
     else:
         raise ValueError(f"Unknown oracle: {oracle}")
 
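With the new keyword threaded through the dispatch above, the cweval oracle can also be exercised directly. A hypothetical invocation (the generations path below is a placeholder; adjust it to your local layout):

```python
from eval.cweval import evaluate_cweval

# evaluate_cweval derives the model name from the file name (everything
# before ".trimmed") and writes the extracted code blocks into
# <CWEval>/evals/<model>/generated_0/ inside the CWEval checkout.
evaluate_cweval(
    generation_path="results/purpcode-14b-rl.trimmed.jsonl",  # placeholder
    task="purpcode/CWEval",
    cweval_path="../CWEval",  # omit to fall back to the sibling ../CWEval
)
```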

eval/main.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@ def main(
     llm_judge: str = DEFAULT_LLM_JUDGE,
     reference_results_path: str = None,
     purplellama_path: str = None,
+    cweval_path: str = None,
     tp: int = 1,
     transform_conversation: str = None,
     oracle: str = None,
@@ -43,6 +44,7 @@ def main(
         llm_judge=llm_judge,
         reference_results_path=reference_results_path,
         purplellama_path=purplellama_path,
+        cweval_path=cweval_path,
     )
 
 