Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 4 additions & 100 deletions scripts/extract-lockfile-prompts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,106 +5,10 @@
# input-dir: directory containing .lock.yml files (default: .github/workflows)
# output-dir: where to write extracted .prompt.md files (default: /tmp/prompt-audit)
#
# Each lockfile's prompt is assembled from heredoc blocks (our content) and
# cat "/opt/gh-aw/prompts/*.md" runtime includes (platform content). This script
# extracts the heredoc content and marks runtime includes as placeholders.
# Wrapper around the Python extractor. Keep this shell entrypoint because
# workflows call this script directly.

# Abort on any failing command, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -euo pipefail

# Positional arguments with defaults: $1 = input directory, $2 = output directory.
INPUT_DIR="${1:-.github/workflows}"
OUTPUT_DIR="${2:-/tmp/prompt-audit}"

mkdir -p "$OUTPUT_DIR"

# Number of lockfiles that produced a non-empty prompt file.
count=0
for lockfile in "$INPUT_DIR"/gh-aw-*.lock.yml; do
# Skip anything that is not a regular file (e.g. the glob matched nothing).
[ -f "$lockfile" ] || continue

# Derive workflow name from filename: gh-aw-foo-bar.lock.yml → foo-bar
basename=$(basename "$lockfile")
name="${basename#gh-aw-}"
name="${name%.lock.yml}"

outfile="$OUTPUT_DIR/${name}.prompt.md"

# State machine to extract prompt content from the "Create prompt" step.
# States: 0=scanning, 1=in prompt block, 2=in heredoc content
# The awk program reads the lockfile and its output is redirected to $outfile.
awk '
BEGIN { state = 0 }

# Find the opening brace of the prompt assembly block
state == 0 && /^[ \t]*\{$/ && saw_create_prompt {
state = 1
next
}

# Track that we have seen the "Create prompt" step
/Create prompt with built-in context/ {
saw_create_prompt = 1
next
}

# End of prompt block
state >= 1 && /\} > "\$GH_AW_PROMPT"/ {
exit
}

# Runtime file include → placeholder
state == 1 && /cat "\/opt\/gh-aw\/prompts\// {
# Extract filename: cat "/opt/gh-aw/prompts/foo.md" → foo.md
s = $0
sub(/.*cat "\/opt\/gh-aw\/prompts\//, "", s)
sub(/".*/, "", s)
if (s != "") {
print "<!-- [RUNTIME INCLUDE: " s "] -->"
print ""
}
next
}

# Start of heredoc block
state == 1 && /cat << .GH_AW_PROMPT_EOF./ {
state = 2
next
}

# End of heredoc block
state == 2 && /^[ \t]*GH_AW_PROMPT_EOF[ \t]*$/ {
state = 1
next
}

# Content inside heredoc — strip leading whitespace (lockfile indents with 10 spaces)
state == 2 {
sub(/^ /, "")
print
}
' "$lockfile" > "$outfile"

# Skip empty extractions (backwards-compat wrapper files, etc.)
if [ ! -s "$outfile" ]; then
rm -f "$outfile"
continue
fi

count=$((count + 1))
done

# Write a manifest listing all extracted files with line counts
# Everything echoed inside the braces is redirected into README.md in one go.
{
echo "# Prompt Audit Manifest"
echo ""
echo "Extracted prompt text from $count lockfiles in \`$INPUT_DIR/\`."
echo ""
echo "| Workflow | Lines | File |"
echo "| --- | --- | --- |"
for f in "$OUTPUT_DIR"/*.prompt.md; do
[ -f "$f" ] || continue
base=$(basename "$f" .prompt.md)
# Count lines in the file; tr -d removes any spaces from the wc output.
lines=$(wc -l < "$f" | tr -d ' ')
echo "| $base | $lines | \`$f\` |"
done
} > "$OUTPUT_DIR/README.md"

# Print a summary and a listing of the output directory.
echo "Extracted prompts from $count lockfiles → $OUTPUT_DIR/"
ls -la "$OUTPUT_DIR/"
# NOTE(review): this view shows both the inline awk extraction above and the
# hand-off to the Python extractor below — confirm that only one is intended to run.
# Resolve the directory containing this script, then replace the shell process
# with the Python extractor, forwarding all original arguments unchanged.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
exec python3 "$SCRIPT_DIR/extract_lockfile_prompts.py" "$@"
152 changes: 152 additions & 0 deletions scripts/extract_lockfile_prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
"""Extract compiled prompts from gh-aw lockfiles using YAML parsing."""

from __future__ import annotations

import argparse
import sys
from pathlib import Path
import re

import yaml


# Text looked for (as a substring) in each step's "name" to locate the step
# whose run script assembles the prompt.
CREATE_PROMPT_STEP_NAME = "Create prompt with built-in context"
# Matches a line consisting solely of `cat <path>` (path optionally quoted)
# where the path contains "/gh-aw/prompts/"; the path is captured as "path".
RUNTIME_INCLUDE_RE = re.compile(
r'^\s*cat\s+["\']?(?P<path>[^"\']*/gh-aw/prompts/[^"\']+)["\']?\s*$'
)
# Matches a heredoc opener such as `cat << 'MARKER'` or `cat <<-MARKER` with
# nothing after the marker; the terminator word is captured as "marker".
HEREDOC_START_RE = re.compile(
r'^\s*cat\s*<<-?\s*["\']?(?P<marker>[A-Za-z0-9_]+)["\']?\s*$'
)


def extract_prompts_from_run(run_script: str) -> str:
    """Collect heredoc bodies and runtime-include placeholders from a run script.

    The script is scanned line by line. Outside a heredoc, a line matching
    ``RUNTIME_INCLUDE_RE`` becomes a ``<!-- [RUNTIME INCLUDE: name] -->``
    placeholder followed by a blank line, and a line matching
    ``HEREDOC_START_RE`` opens a heredoc. Inside a heredoc every line is kept
    verbatim until a line whose stripped text equals the marker is reached.

    Returns the collected text with trailing whitespace removed and a single
    trailing newline, or an empty string when nothing was collected.
    """
    collected: list[str] = []
    terminator: str | None = None  # None while outside any heredoc body

    for raw in run_script.splitlines():
        if terminator is None:
            include = RUNTIME_INCLUDE_RE.match(raw)
            if include is not None:
                filename = Path(include.group("path")).name
                collected.extend((f"<!-- [RUNTIME INCLUDE: {filename}] -->", ""))
                continue
            opener = HEREDOC_START_RE.match(raw)
            if opener is not None:
                terminator = opener.group("marker")
        elif raw.strip() == terminator:
            # Closing marker: leave the heredoc without emitting the marker line.
            terminator = None
        else:
            collected.append(raw)

    body = "\n".join(collected).rstrip()
    return f"{body}\n" if body else ""


def extract_lockfile_prompt(lockfile_path: Path) -> str:
    """Return the prompt text assembled by a lockfile's "Create prompt" step.

    The file is parsed with ``yaml.safe_load``. The first step (in job order,
    then step order) whose string ``name`` contains
    ``CREATE_PROMPT_STEP_NAME`` and whose ``run`` is a string is passed to
    ``extract_prompts_from_run``. Entries of an unexpected type are skipped.
    An empty string is returned when no such step exists.
    """
    with lockfile_path.open("r", encoding="utf-8") as handle:
        document = yaml.safe_load(handle)

    workflow_jobs = document.get("jobs") if isinstance(document, dict) else None
    if not isinstance(workflow_jobs, dict):
        return ""

    for job_def in workflow_jobs.values():
        job_steps = job_def.get("steps") if isinstance(job_def, dict) else None
        if not isinstance(job_steps, list):
            continue
        for entry in job_steps:
            if not isinstance(entry, dict):
                continue
            label = entry.get("name")
            script = entry.get("run")
            if (
                isinstance(label, str)
                and CREATE_PROMPT_STEP_NAME in label
                and isinstance(script, str)
            ):
                return extract_prompts_from_run(script)

    return ""


def write_manifest(output_dir: Path, input_dir: Path, extracted_files: list[Path]) -> None:
    """Write ``README.md`` in *output_dir* listing every extracted prompt file.

    The manifest holds a heading, a sentence giving the number of extracted
    files and the input directory, and a Markdown table with one row per file:
    the file name without its ``.prompt.md`` suffix, its line count, and its
    path.
    """
    header = [
        "# Prompt Audit Manifest",
        "",
        f"Extracted prompt text from {len(extracted_files)} lockfiles in `{input_dir}/`.",
        "",
        "| Workflow | Lines | File |",
        "| --- | --- | --- |",
    ]

    def table_row(entry: Path) -> str:
        # One table row: workflow name, number of lines, path of the prompt file.
        stem = entry.name.removesuffix(".prompt.md")
        total = len(entry.read_text(encoding="utf-8").splitlines())
        return f"| {stem} | {total} | `{entry}` |"

    rows = [table_row(entry) for entry in extracted_files]
    manifest_path = output_dir / "README.md"
    manifest_path.write_text("\n".join([*header, *rows]) + "\n", encoding="utf-8")


def parse_args(argv: list[str]) -> argparse.Namespace:
    """Parse the two optional positional arguments of the extractor CLI.

    ``input_dir`` defaults to ``.github/workflows`` and ``output_dir`` to
    ``/tmp/prompt-audit``; both are returned as strings on the namespace.
    """
    cli = argparse.ArgumentParser(
        description="Extract agent prompt text from gh-aw .lock.yml files."
    )
    positionals = (
        (
            "input_dir",
            ".github/workflows",
            "Directory containing gh-aw-*.lock.yml files.",
        ),
        (
            "output_dir",
            "/tmp/prompt-audit",
            "Directory where extracted *.prompt.md files will be written.",
        ),
    )
    for dest, fallback, description in positionals:
        # nargs="?" makes each positional optional so the default can apply.
        cli.add_argument(dest, nargs="?", default=fallback, help=description)
    return cli.parse_args(argv)


def main(argv: list[str]) -> int:
    """Extract prompts from every ``gh-aw-*.lock.yml`` file and write a manifest.

    For each lockfile in the input directory (in sorted order) the extracted
    prompt is written to ``<name>.prompt.md`` in the output directory. When
    nothing is extracted, any existing output file of that name is removed.
    A ``README.md`` manifest is then written, a summary line and the output
    directory's entries are printed, and 0 is returned.
    """
    options = parse_args(argv)
    source_dir = Path(options.input_dir)
    target_dir = Path(options.output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    written: list[Path] = []
    for lock in sorted(source_dir.glob("gh-aw-*.lock.yml")):
        # gh-aw-foo-bar.lock.yml -> foo-bar
        stem = lock.name.removeprefix("gh-aw-").removesuffix(".lock.yml")
        destination = target_dir / f"{stem}.prompt.md"
        content = extract_lockfile_prompt(lock)

        if not content:
            # Nothing extracted: make sure no earlier output file is left behind.
            if destination.exists():
                destination.unlink()
            continue

        destination.write_text(content, encoding="utf-8")
        written.append(destination)

    write_manifest(target_dir, source_dir, written)

    print(f"Extracted prompts from {len(written)} lockfiles -> {target_dir}/")
    for entry in sorted(target_dir.glob("*")):
        print(entry)
    return 0


# Script entry point: run main() on the CLI arguments and exit with its return code.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
105 changes: 105 additions & 0 deletions tests/test_extract_lockfile_prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import importlib.util
import subprocess
import sys
from pathlib import Path

import yaml


# Both paths are resolved relative to the directory two levels above this file.
# Path of the extractor script exercised by these tests.
SCRIPT_PATH = Path(__file__).resolve().parent.parent.joinpath(
    "scripts", "extract_lockfile_prompts.py"
)
# Lockfile copied into a temporary directory by the CLI extraction test.
CURRENT_LOCKFILE = Path(__file__).resolve().parent.parent.joinpath(
    ".github", "workflows", "gh-aw-framework-best-practices.lock.yml"
)


def _load_module():
    """Load the script at SCRIPT_PATH as a module object and return it."""
    module_spec = importlib.util.spec_from_file_location(
        "extract_lockfile_prompts", SCRIPT_PATH
    )
    assert module_spec and module_spec.loader
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded


def _run_script(input_dir: Path, output_dir: Path) -> subprocess.CompletedProcess[str]:
    """Run the extractor script with the current interpreter and capture its output.

    A non-zero exit status does not raise; callers inspect ``returncode``.
    """
    command = [sys.executable, str(SCRIPT_PATH), str(input_dir), str(output_dir)]
    return subprocess.run(command, capture_output=True, text=True, check=False)


def test_extract_prompts_from_run_supports_legacy_markers():
    """A GH_AW_PROMPT_EOF heredoc and a runtime include are both extracted."""
    extractor = _load_module()
    script = """
{
cat "/opt/gh-aw/prompts/base.md"
cat << 'GH_AW_PROMPT_EOF'
hello
world
GH_AW_PROMPT_EOF
} > "$GH_AW_PROMPT"
""".strip()

    output = extractor.extract_prompts_from_run(script)

    for expected in ("<!-- [RUNTIME INCLUDE: base.md] -->", "hello", "world"):
        assert expected in output


def test_cli_extracts_current_lockfile_prompt_block(tmp_path):
    """The CLI extracts the prompt of a copy of CURRENT_LOCKFILE and lists it."""
    source_dir = tmp_path / "input"
    target_dir = tmp_path / "output"
    for directory in (source_dir, target_dir):
        directory.mkdir()

    # Work on a copy so the script only sees this single lockfile.
    copied = source_dir / CURRENT_LOCKFILE.name
    copied.write_text(CURRENT_LOCKFILE.read_text(encoding="utf-8"), encoding="utf-8")

    completed = _run_script(source_dir, target_dir)
    assert completed.returncode == 0

    prompt_path = target_dir / "framework-best-practices.prompt.md"
    assert prompt_path.exists()

    prompt_text = prompt_path.read_text(encoding="utf-8")
    for marker in (
        "<!-- [RUNTIME INCLUDE: xpia.md] -->",
        "<safe-output-tools>",
        "</system>",
    ):
        assert marker in prompt_text

    readme = (target_dir / "README.md").read_text(encoding="utf-8")
    for snippet in (
        "Extracted prompt text from 1 lockfiles",
        "framework-best-practices",
    ):
        assert snippet in readme


def test_cli_skips_lockfile_without_create_prompt_step(tmp_path):
    """A lockfile with no "Create prompt" step yields no prompt file and a count of 0."""
    source_dir = tmp_path / "input"
    target_dir = tmp_path / "output"
    for directory in (source_dir, target_dir):
        directory.mkdir()

    # Minimal workflow whose only step is unrelated to prompt creation.
    workflow = {"jobs": {"run": {"steps": [{"name": "Checkout", "run": "echo hi"}]}}}
    lockfile = source_dir / "gh-aw-empty.lock.yml"
    lockfile.write_text(yaml.safe_dump(workflow), encoding="utf-8")

    completed = _run_script(source_dir, target_dir)
    assert completed.returncode == 0

    prompt_path = target_dir / "empty.prompt.md"
    assert not prompt_path.exists()

    readme = (target_dir / "README.md").read_text(encoding="utf-8")
    assert "Extracted prompt text from 0 lockfiles" in readme