Skip to content

Commit 380b359

Browse files
authored
feat(eval): cyberseceval (#8)
* feat(eval): cyberseceval * feat: add configurable purplellama_path parameter for cyberseceval * docs: CyberSecEval SCG evaluation setup guide * docs: refine the guide for CyberSecEval SCG * fix: gemini comments * fix: ganler comments
1 parent 2586f84 commit 380b359

5 files changed

Lines changed: 102 additions & 4 deletions

File tree

README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,28 @@ python eval/main.py --task "purpcode/PHTest" --model purpcode/purpco
174174
Notes:
175175
* `--oracle` for evaluating customized generation (default guessing from dataset).
176176

177+
<details><summary><b>CyberSecEval SCG Evaluation Setup</b> <i>:: click to expand ::</i></summary>
178+
<div>
179+
180+
```bash
181+
# Download and set up the PurpleLlama repository for CyberSecEval evaluation
182+
# Note: Run from purpcode directory, PurpleLlama will be cloned as a sibling directory
183+
git clone https://github.com/meta-llama/PurpleLlama.git ../PurpleLlama
184+
pip install -r ../PurpleLlama/CybersecurityBenchmarks/requirements.txt
185+
186+
# Run CyberSecEval SCG evaluation (default setup)
187+
python eval/main.py --task "purpcode/CyberSecEval-SCG" --model purpcode/purpcode-14b-rl
188+
189+
# Alternative: if PurpleLlama is not cloned as a sibling of the purpcode directory, specify its location with the --purplellama_path parameter
190+
# Example (replace with your actual PurpleLlama installation path):
191+
python eval/main.py --task "purpcode/CyberSecEval-SCG" \
192+
--model purpcode/purpcode-14b-rl \
193+
--purplellama_path ../PurpleLlama
194+
```
195+
196+
</div>
197+
</details>
198+
177199
## Acknowledgements
178200

179201
- [Amazon Nova AI Challenge](https://www.amazon.science/nova-ai-challenge) for funding our research

eval/cyberseceval.py

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,75 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5-
# TODO(@zhewang2001): Please refactor the corresponding code snippets and then upload it.
5+
import json
6+
import os
7+
import subprocess
68

9+
from datasets import load_dataset
710

8-
def evaluate_cyberseceval(*args):
9-
pass # placeholder
11+
12+
def evaluate_cyberseceval(generation_path, task, purplellama_path=None):
    """Evaluate model generations with PurpleLlama's CyberSecEval benchmark.

    Reformats a JSONL generation file into the JSON schema expected by the
    ``CybersecurityBenchmarks`` runner, then invokes the runner on the
    precomputed responses.

    Args:
        generation_path: Path to a ``.jsonl`` file where each line is a JSON
            record containing ``task_id`` and chat-format ``messages``.
        task: Hugging Face dataset name whose ``test`` split provides the
            CyberSecEval task metadata keyed by ``task_id``.
        purplellama_path: Optional path to a PurpleLlama checkout. Defaults
            to a ``PurpleLlama`` directory that is a sibling of the current
            working directory.

    Raises:
        subprocess.CalledProcessError: If the benchmark runner exits non-zero.
    """
    # Resolve to an absolute path up front: we chdir below, which would break
    # any relative output path passed to the benchmark runner.
    output_path = os.path.abspath(generation_path.replace(".jsonl", ".reformat.json"))
    task_dataset = load_dataset(task)["test"]
    task_dict = {item["task_id"]: item for item in task_dataset}

    with open(generation_path, "r") as f:
        data = [json.loads(line) for line in f]

    # The model name is derived from the generation file name; it is
    # loop-invariant, so compute it once instead of per record.
    model = os.path.basename(generation_path).split(".")[0]

    results = []
    for idx, item in enumerate(data):
        task_id = item["task_id"]
        # Take the first assistant turn as the model response.
        assistant_content = None
        for msg in item["messages"]:
            if msg["role"] == "assistant":
                assistant_content = msg["content"]
                break

        # Skip generations whose task_id is not in the reference dataset.
        if task_id in task_dict:
            task_data = task_dict[task_id]
            results.append(
                {
                    "prompt_id": idx,
                    "pass_id": 0,
                    "test_case_prompt": task_data["test_case_prompt"],
                    "response": assistant_content,
                    "cwe_identifier": task_data["cwe_identifier"],
                    "language": task_data["language"],
                    "line_text": task_data["line_text"],
                    "origin_code": task_data["origin_code"],
                    "variant": task_data.get("variant", ""),
                    "rule": task_data.get("rule", ""),
                    "repo": task_data.get("repo", ""),
                    "model": model,
                }
            )

    with open(output_path, "w") as f:
        json.dump(results, f, indent=4)

    # The benchmark runner must execute from inside the PurpleLlama checkout
    # and needs that directory on PYTHONPATH. Both are process-global state,
    # so save them and restore in a `finally` block — otherwise a failing run
    # (subprocess.run with check=True, or a bad chdir) would leave the whole
    # process in the wrong directory with a polluted PYTHONPATH.
    original_pythonpath = os.environ.get("PYTHONPATH", "")
    current_dir = os.getcwd()
    try:
        if purplellama_path:
            os.chdir(purplellama_path)
        else:
            # Default layout: PurpleLlama cloned as a sibling directory.
            os.chdir(os.path.join("..", "PurpleLlama"))
        os.environ["PYTHONPATH"] = original_pythonpath + os.pathsep + os.getcwd()

        stat_path = output_path.replace(".reformat.json", ".stat.json")

        cmd = [
            "python3",
            "-m",
            "CybersecurityBenchmarks.benchmark.run",
            "--benchmark=instruct",
            "--use-precomputed-responses",
            f"--response-path={output_path}",
            f"--stat-path={stat_path}",
        ]
        subprocess.run(cmd, check=True)
    finally:
        os.chdir(current_dir)
        os.environ["PYTHONPATH"] = original_pythonpath

eval/eval_only.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,15 @@ def main(
1111
oracle: str = None,
1212
llm_judge: str = DEFAULT_LLM_JUDGE,
1313
reference_results_path: str = None,
14+
purplellama_path: str = None,
1415
):
1516
evaluate_main(
1617
task,
1718
generation_path,
1819
oracle=oracle,
1920
llm_judge=llm_judge,
2021
reference_results_path=reference_results_path,
22+
purplellama_path=purplellama_path,
2123
)
2224

2325

eval/evaluate.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def evaluate_main(
3838
oracle: str = None,
3939
llm_judge: str = None,
4040
reference_results_path: str = None,
41+
purplellama_path: str = None,
4142
):
4243
if oracle is None: # Guessing oracle
4344
print(f"Guessing oracle for task {task}...")
@@ -89,7 +90,11 @@ def evaluate_main(
8990
elif oracle == "cyberseceval":
9091
from eval.cyberseceval import evaluate_cyberseceval
9192

92-
evaluate_cyberseceval(generation_path=generation_path, task=task)
93+
evaluate_cyberseceval(
94+
generation_path=generation_path,
95+
task=task,
96+
purplellama_path=purplellama_path,
97+
)
9398
elif oracle == "codeguru":
9499
from eval.oracles.secure_code_oracles import evaluate_secure_code_gen
95100

eval/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def main(
1414
model_id: str = None,
1515
llm_judge: str = DEFAULT_LLM_JUDGE,
1616
reference_results_path: str = None,
17+
purplellama_path: str = None,
1718
tp: int = 1,
1819
transform_conversation: str = None,
1920
oracle: str = None,
@@ -41,6 +42,7 @@ def main(
4142
oracle=oracle,
4243
llm_judge=llm_judge,
4344
reference_results_path=reference_results_path,
45+
purplellama_path=purplellama_path,
4446
)
4547

4648

0 commit comments

Comments
 (0)