
Commit 2fc2447

feat(eval): cweval (#9)
* docs: CWEval evaluation setup guide
* feat(eval): cweval
* refactor: simplify code block extraction logic
* fix: gemini comments
1 parent 380b359 commit 2fc2447

5 files changed: 88 additions & 4 deletions

README.md

Lines changed: 25 additions & 0 deletions
@@ -196,6 +196,31 @@ python eval/main.py --task "purpcode/CyberSecEval-SCG" \
 </div>
 </details>
 
+<details><summary><b>CWEval Evaluation Setup</b> <i>:: click to expand ::</i></summary>
+<div>
+
+```bash
+# Download and set up the CWEval repository for CWEval evaluation
+# Note: run from the purpcode directory; CWEval will be cloned as a sibling directory
+git clone https://github.com/Co1lin/CWEval.git ../CWEval
+
+# Run the CWEval evaluation (default setup)
+python eval/main.py --task "purpcode/CWEval" --model purpcode/purpcode-14b-rl
+
+# Alternative: if CWEval is not a sibling of the purpcode directory, pass its location via --cweval_path
+# Example (replace with your actual CWEval installation path):
+python eval/main.py --task "purpcode/CWEval" \
+    --model purpcode/purpcode-14b-rl \
+    --cweval_path ../CWEval
+
+# Note: generated files are saved into the CWEval repository
+# purpcode only handles response generation; evaluation must be performed in the CWEval repository
+# Follow the CWEval README (https://github.com/Co1lin/CWEval/blob/main/README.md) for the remaining steps
+```
+
+</div>
+</details>
+
 ## Acknowledgements
 
 - [Amazon Nova AI Challenge](https://www.amazon.science/nova-ai-challenge) for funding our research
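The sibling-directory default documented above mirrors the fallback in `eval/cweval.py` below: when `--cweval_path` is omitted, the evaluator changes into a directory named `CWEval` next to the purpcode checkout. A minimal sketch of that resolution (`resolve_cweval_dir` is an illustrative helper, not part of the repository):

```python
import os

def resolve_cweval_dir(cweval_path=None):
    # Illustrative helper: an explicit path wins; otherwise assume CWEval
    # sits next to the current (purpcode) checkout, as cweval.py does via
    # os.chdir("..") followed by os.chdir("CWEval").
    if cweval_path:
        return os.path.abspath(cweval_path)
    return os.path.abspath(os.path.join("..", "CWEval"))

print(resolve_cweval_dir())               # e.g. /home/user/CWEval
print(resolve_cweval_dir("/opt/CWEval"))  # /opt/CWEval
```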

eval/cweval.py

Lines changed: 55 additions & 3 deletions
@@ -2,8 +2,60 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-# TODO(@zhewang2001): Please refactor the corresponding code snippets and then upload it.
+import json
+import os
 
+from datasets import load_dataset
 
-def evaluate_cweval(*args):
-    pass  # placeholder
+
+def evaluate_cweval(generation_path, task, cweval_path=None):
+    model = generation_path.split("/")[-1].split(".trimmed")[0]
+
+    current_dir = os.getcwd()
+    generation_path = os.path.abspath(generation_path)
+
+    if cweval_path:
+        os.chdir(cweval_path)
+    else:
+        os.chdir("..")
+        os.chdir("CWEval")
+    base_output_dir = os.path.join(os.getcwd(), "evals", model, "generated_0")
+
+    task_dataset = load_dataset(task)["test"]
+    task_dict = {
+        item["task_id"]: item["file_path"].replace("_task", "_raw")
+        for item in task_dataset
+    }
+
+    os.makedirs(base_output_dir, exist_ok=True)
+
+    with open(generation_path, "r") as f:
+        data = [json.loads(line) for line in f]
+
+    for item in data:
+        task_id = item["task_id"]
+        file_path = task_dict.get(task_id)
+
+        if file_path and "messages" in item:
+            assistant_content = None
+            for message in item["messages"]:
+                if message["role"] == "assistant":
+                    assistant_content = message["content"]
+                    break
+
+            if assistant_content:
+                code_blocks = assistant_content.split("```")
+                if len(code_blocks) >= 3:
+                    code_block_with_lang = code_blocks[1]
+                    if "\n" in code_block_with_lang:
+                        code_block = code_block_with_lang.split("\n", 1)[1].strip()
+                    else:
+                        code_block = code_block_with_lang.strip()
+
+                    output_path = os.path.join(base_output_dir, file_path)
+                    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+                    with open(output_path, "w") as f:
+                        f.write(code_block)
+
+    os.chdir(current_dir)
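The simplified extraction step above takes the first fenced block from the assistant message: splitting on ``` leaves the block body at index 1, and the first line (the optional language tag) is dropped. A standalone sketch of that behavior (`extract_first_code_block` is an illustrative name, not a function in the repository):

```python
def extract_first_code_block(assistant_content: str) -> str | None:
    # Mirrors the logic in evaluate_cweval: return the text between the
    # first pair of ``` fences, minus the language tag on its first line.
    code_blocks = assistant_content.split("```")
    if len(code_blocks) < 3:  # no complete fenced block in the response
        return None
    block = code_blocks[1]
    return block.split("\n", 1)[1].strip() if "\n" in block else block.strip()

reply = "Here is a fix:\n```c\nint main(void) { return 0; }\n```\nDone."
print(extract_first_code_block(reply))  # -> int main(void) { return 0; }
```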

eval/eval_only.py

Lines changed: 2 additions & 0 deletions
@@ -12,6 +12,7 @@ def main(
     llm_judge: str = DEFAULT_LLM_JUDGE,
     reference_results_path: str = None,
     purplellama_path: str = None,
+    cweval_path: str = None,
 ):
     evaluate_main(
         task,
@@ -20,6 +21,7 @@ def main(
         llm_judge=llm_judge,
         reference_results_path=reference_results_path,
         purplellama_path=purplellama_path,
+        cweval_path=cweval_path,
     )
 
 
eval/evaluate.py

Lines changed: 4 additions & 1 deletion
@@ -39,6 +39,7 @@ def evaluate_main(
     llm_judge: str = None,
     reference_results_path: str = None,
     purplellama_path: str = None,
+    cweval_path: str = None,
 ):
     if oracle is None:  # Guessing oracle
         print(f"Guessing oracle for task {task}...")
@@ -143,7 +144,9 @@ def evaluate_main(
     elif oracle == "cweval":
         from eval.cweval import evaluate_cweval
 
-        evaluate_cweval(generation_path=generation_path, task=task)
+        evaluate_cweval(
+            generation_path=generation_path, task=task, cweval_path=cweval_path
+        )
     else:
         raise ValueError(f"Unknown oracle: {oracle}")
 
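With the new keyword threaded through the dispatch above, the cweval oracle can also be exercised directly. A hypothetical invocation (the generations path below is a placeholder; adjust it to your local layout):

```python
from eval.cweval import evaluate_cweval

# evaluate_cweval derives the model name from the file name (everything
# before ".trimmed") and writes the extracted code blocks into
# <CWEval>/evals/<model>/generated_0/ inside the CWEval checkout.
evaluate_cweval(
    generation_path="results/purpcode-14b-rl.trimmed.jsonl",  # placeholder
    task="purpcode/CWEval",
    cweval_path="../CWEval",  # omit to fall back to the sibling ../CWEval
)
```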

eval/main.py

Lines changed: 2 additions & 0 deletions
@@ -15,6 +15,7 @@ def main(
     llm_judge: str = DEFAULT_LLM_JUDGE,
     reference_results_path: str = None,
     purplellama_path: str = None,
+    cweval_path: str = None,
     tp: int = 1,
     transform_conversation: str = None,
     oracle: str = None,
@@ -43,6 +44,7 @@ def main(
         llm_judge=llm_judge,
         reference_results_path=reference_results_path,
         purplellama_path=purplellama_path,
+        cweval_path=cweval_path,
     )
 
 