From 9fb92c73bc47505fc81995513b33cb576c7b5c04 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 2 Jun 2026 14:49:28 +0000 Subject: [PATCH] Fix lockfile prompt extraction for current compiled workflows Replace brittle awk parsing with YAML-based extraction that supports RUNNER_TEMP prompt includes and hashed GH_AW_PROMPT_*_EOF heredoc markers. Adds regression tests for current lockfile format and legacy markers. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- scripts/extract-lockfile-prompts.sh | 104 +---------------- scripts/extract_lockfile_prompts.py | 152 +++++++++++++++++++++++++ tests/test_extract_lockfile_prompts.py | 105 +++++++++++++++++ 3 files changed, 261 insertions(+), 100 deletions(-) create mode 100644 scripts/extract_lockfile_prompts.py create mode 100644 tests/test_extract_lockfile_prompts.py diff --git a/scripts/extract-lockfile-prompts.sh b/scripts/extract-lockfile-prompts.sh index ff50b331..d911401c 100755 --- a/scripts/extract-lockfile-prompts.sh +++ b/scripts/extract-lockfile-prompts.sh @@ -5,106 +5,10 @@ # input-dir: directory containing .lock.yml files (default: .github/workflows) # output-dir: where to write extracted .prompt.md files (default: /tmp/prompt-audit) # -# Each lockfile's prompt is assembled from heredoc blocks (our content) and -# cat "/opt/gh-aw/prompts/*.md" runtime includes (platform content). This script -# extracts the heredoc content and marks runtime includes as placeholders. +# Wrapper around the Python extractor. Keep this shell entrypoint because +# workflows call this script directly. set -euo pipefail -INPUT_DIR="${1:-.github/workflows}" -OUTPUT_DIR="${2:-/tmp/prompt-audit}" - -mkdir -p "$OUTPUT_DIR" - -count=0 -for lockfile in "$INPUT_DIR"/gh-aw-*.lock.yml; do - [ -f "$lockfile" ] || continue - - # Derive workflow name from filename: gh-aw-foo-bar.lock.yml → foo-bar - basename=$(basename "$lockfile") - name="${basename#gh-aw-}" - name="${name%.lock.yml}" - - outfile="$OUTPUT_DIR/${name}.prompt.md" - - # State machine to extract prompt content from the "Create prompt" step. - # States: 0=scanning, 1=in prompt block, 2=in heredoc content - awk ' - BEGIN { state = 0 } - - # Find the opening brace of the prompt assembly block - state == 0 && /^[ \t]*\{$/ && saw_create_prompt { - state = 1 - next - } - - # Track that we have seen the "Create prompt" step - /Create prompt with built-in context/ { - saw_create_prompt = 1 - next - } - - # End of prompt block - state >= 1 && /\} > "\$GH_AW_PROMPT"/ { - exit - } - - # Runtime file include → placeholder - state == 1 && /cat "\/opt\/gh-aw\/prompts\// { - # Extract filename: cat "/opt/gh-aw/prompts/foo.md" → foo.md - s = $0 - sub(/.*cat "\/opt\/gh-aw\/prompts\//, "", s) - sub(/".*/, "", s) - if (s != "") { - print "" - print "" - } - next - } - - # Start of heredoc block - state == 1 && /cat << .GH_AW_PROMPT_EOF./ { - state = 2 - next - } - - # End of heredoc block - state == 2 && /^[ \t]*GH_AW_PROMPT_EOF[ \t]*$/ { - state = 1 - next - } - - # Content inside heredoc — strip leading whitespace (lockfile indents with 10 spaces) - state == 2 { - sub(/^ /, "") - print - } - ' "$lockfile" > "$outfile" - - # Skip empty extractions (backwards-compat wrapper files, etc.) - if [ ! -s "$outfile" ]; then - rm -f "$outfile" - continue - fi - - count=$((count + 1)) -done - -# Write a manifest listing all extracted files with line counts -{ - echo "# Prompt Audit Manifest" - echo "" - echo "Extracted prompt text from $count lockfiles in \`$INPUT_DIR/\`." - echo "" - echo "| Workflow | Lines | File |" - echo "| --- | --- | --- |" - for f in "$OUTPUT_DIR"/*.prompt.md; do - [ -f "$f" ] || continue - base=$(basename "$f" .prompt.md) - lines=$(wc -l < "$f" | tr -d ' ') - echo "| $base | $lines | \`$f\` |" - done -} > "$OUTPUT_DIR/README.md" - -echo "Extracted prompts from $count lockfiles → $OUTPUT_DIR/" -ls -la "$OUTPUT_DIR/" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +exec python3 "$SCRIPT_DIR/extract_lockfile_prompts.py" "$@" diff --git a/scripts/extract_lockfile_prompts.py b/scripts/extract_lockfile_prompts.py new file mode 100644 index 00000000..dd0353c1 --- /dev/null +++ b/scripts/extract_lockfile_prompts.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +"""Extract compiled prompts from gh-aw lockfiles using YAML parsing.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +import re + +import yaml + + +CREATE_PROMPT_STEP_NAME = "Create prompt with built-in context" +RUNTIME_INCLUDE_RE = re.compile( + r'^\s*cat\s+["\']?(?P[^"\']*/gh-aw/prompts/[^"\']+)["\']?\s*$' +) +HEREDOC_START_RE = re.compile( + r'^\s*cat\s*<<-?\s*["\']?(?P[A-Za-z0-9_]+)["\']?\s*$' +) + + +def extract_prompts_from_run(run_script: str) -> str: + """Extract heredoc content and runtime include placeholders from a run script.""" + lines: list[str] = [] + heredoc_end_marker: str | None = None + + for line in run_script.splitlines(): + if heredoc_end_marker is not None: + if line.strip() == heredoc_end_marker: + heredoc_end_marker = None + continue + lines.append(line) + continue + + include_match = RUNTIME_INCLUDE_RE.match(line) + if include_match: + include_name = Path(include_match.group("path")).name + lines.append(f"") + lines.append("") + continue + + heredoc_match = HEREDOC_START_RE.match(line) + if heredoc_match: + heredoc_end_marker = heredoc_match.group("marker") + continue + + extracted = "\n".join(lines).rstrip() + if not extracted: + return "" + return f"{extracted}\n" + + +def extract_lockfile_prompt(lockfile_path: Path) -> str: + """Extract prompt content from a single lockfile's Create prompt step.""" + with lockfile_path.open("r", encoding="utf-8") as lockfile: + data = yaml.safe_load(lockfile) + + if not isinstance(data, dict): + return "" + + jobs = data.get("jobs") + if not isinstance(jobs, dict): + return "" + + for job in jobs.values(): + if not isinstance(job, dict): + continue + steps = job.get("steps") + if not isinstance(steps, list): + continue + for step in steps: + if not isinstance(step, dict): + continue + step_name = step.get("name") + if not isinstance(step_name, str): + continue + if CREATE_PROMPT_STEP_NAME not in step_name: + continue + run_script = step.get("run") + if not isinstance(run_script, str): + continue + return extract_prompts_from_run(run_script) + + return "" + + +def write_manifest(output_dir: Path, input_dir: Path, extracted_files: list[Path]) -> None: + lines = [ + "# Prompt Audit Manifest", + "", + f"Extracted prompt text from {len(extracted_files)} lockfiles in `{input_dir}/`.", + "", + "| Workflow | Lines | File |", + "| --- | --- | --- |", + ] + + for prompt_file in extracted_files: + workflow_name = prompt_file.name.removesuffix(".prompt.md") + line_count = len(prompt_file.read_text(encoding="utf-8").splitlines()) + lines.append(f"| {workflow_name} | {line_count} | `{prompt_file}` |") + + (output_dir / "README.md").write_text("\n".join(lines) + "\n", encoding="utf-8") + + +def parse_args(argv: list[str]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Extract agent prompt text from gh-aw .lock.yml files." + ) + parser.add_argument( + "input_dir", + nargs="?", + default=".github/workflows", + help="Directory containing gh-aw-*.lock.yml files.", + ) + parser.add_argument( + "output_dir", + nargs="?", + default="/tmp/prompt-audit", + help="Directory where extracted *.prompt.md files will be written.", + ) + return parser.parse_args(argv) + + +def main(argv: list[str]) -> int: + args = parse_args(argv) + input_dir = Path(args.input_dir) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + extracted_files: list[Path] = [] + for lockfile_path in sorted(input_dir.glob("gh-aw-*.lock.yml")): + workflow_name = lockfile_path.name.removeprefix("gh-aw-").removesuffix(".lock.yml") + output_file = output_dir / f"{workflow_name}.prompt.md" + prompt_text = extract_lockfile_prompt(lockfile_path) + + if prompt_text: + output_file.write_text(prompt_text, encoding="utf-8") + extracted_files.append(output_file) + elif output_file.exists(): + output_file.unlink() + + write_manifest(output_dir, input_dir, extracted_files) + + print(f"Extracted prompts from {len(extracted_files)} lockfiles -> {output_dir}/") + for path in sorted(output_dir.glob("*")): + print(path) + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/tests/test_extract_lockfile_prompts.py b/tests/test_extract_lockfile_prompts.py new file mode 100644 index 00000000..73e0425f --- /dev/null +++ b/tests/test_extract_lockfile_prompts.py @@ -0,0 +1,105 @@ +import importlib.util +import subprocess +import sys +from pathlib import Path + +import yaml + + +SCRIPT_PATH = ( + Path(__file__).resolve().parent.parent / "scripts" / "extract_lockfile_prompts.py" +) +CURRENT_LOCKFILE = ( + Path(__file__).resolve().parent.parent + / ".github" + / "workflows" + / "gh-aw-framework-best-practices.lock.yml" +) + + +def _load_module(): + spec = importlib.util.spec_from_file_location("extract_lockfile_prompts", SCRIPT_PATH) + assert spec and spec.loader + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def _run_script(input_dir: Path, output_dir: Path) -> subprocess.CompletedProcess[str]: + return subprocess.run( + [sys.executable, str(SCRIPT_PATH), str(input_dir), str(output_dir)], + capture_output=True, + text=True, + check=False, + ) + + +def test_extract_prompts_from_run_supports_legacy_markers(): + module = _load_module() + run_script = """ +{ +cat "/opt/gh-aw/prompts/base.md" +cat << 'GH_AW_PROMPT_EOF' +hello +world +GH_AW_PROMPT_EOF +} > "$GH_AW_PROMPT" +""".strip() + + extracted = module.extract_prompts_from_run(run_script) + + assert "" in extracted + assert "hello" in extracted + assert "world" in extracted + + +def test_cli_extracts_current_lockfile_prompt_block(tmp_path): + input_dir = tmp_path / "input" + output_dir = tmp_path / "output" + input_dir.mkdir() + output_dir.mkdir() + + lockfile_copy = input_dir / CURRENT_LOCKFILE.name + lockfile_copy.write_text(CURRENT_LOCKFILE.read_text(encoding="utf-8"), encoding="utf-8") + + result = _run_script(input_dir, output_dir) + + assert result.returncode == 0 + extracted_file = output_dir / "framework-best-practices.prompt.md" + assert extracted_file.exists() + + extracted = extracted_file.read_text(encoding="utf-8") + assert "" in extracted + assert "" in extracted + assert "" in extracted + + manifest = (output_dir / "README.md").read_text(encoding="utf-8") + assert "Extracted prompt text from 1 lockfiles" in manifest + assert "framework-best-practices" in manifest + + +def test_cli_skips_lockfile_without_create_prompt_step(tmp_path): + input_dir = tmp_path / "input" + output_dir = tmp_path / "output" + input_dir.mkdir() + output_dir.mkdir() + + lockfile_data = { + "jobs": { + "run": { + "steps": [ + {"name": "Checkout", "run": "echo hi"}, + ] + } + } + } + (input_dir / "gh-aw-empty.lock.yml").write_text( + yaml.safe_dump(lockfile_data), encoding="utf-8" + ) + + result = _run_script(input_dir, output_dir) + + assert result.returncode == 0 + assert not (output_dir / "empty.prompt.md").exists() + manifest = (output_dir / "README.md").read_text(encoding="utf-8") + assert "Extracted prompt text from 0 lockfiles" in manifest