From 9fb92c73bc47505fc81995513b33cb576c7b5c04 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 2 Jun 2026 14:49:28 +0000
Subject: [PATCH] Fix lockfile prompt extraction for current compiled workflows

Replace brittle awk parsing with YAML-based extraction that supports RUNNER_TEMP prompt includes and hashed GH_AW_PROMPT_*_EOF heredoc markers. Add regression tests for the current lockfile format and legacy markers.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 scripts/extract-lockfile-prompts.sh    | 104 +----------------
 scripts/extract_lockfile_prompts.py    | 152 +++++++++++++++++++++++++
 tests/test_extract_lockfile_prompts.py | 105 +++++++++++++++++
 3 files changed, 261 insertions(+), 100 deletions(-)
 create mode 100644 scripts/extract_lockfile_prompts.py
 create mode 100644 tests/test_extract_lockfile_prompts.py

diff --git a/scripts/extract-lockfile-prompts.sh b/scripts/extract-lockfile-prompts.sh
index ff50b331..d911401c 100755
--- a/scripts/extract-lockfile-prompts.sh
+++ b/scripts/extract-lockfile-prompts.sh
@@ -5,106 +5,10 @@
 #   input-dir:  directory containing .lock.yml files (default: .github/workflows)
 #   output-dir: where to write extracted .prompt.md files (default: /tmp/prompt-audit)
 #
-# Each lockfile's prompt is assembled from heredoc blocks (our content) and
-# cat "/opt/gh-aw/prompts/*.md" runtime includes (platform content). This script
-# extracts the heredoc content and marks runtime includes as placeholders.
+# Wrapper around the Python extractor. Keep this shell entrypoint because
+# workflows call this script directly.
 
 set -euo pipefail
 
-INPUT_DIR="${1:-.github/workflows}"
-OUTPUT_DIR="${2:-/tmp/prompt-audit}"
-
-mkdir -p "$OUTPUT_DIR"
-
-count=0
-for lockfile in "$INPUT_DIR"/gh-aw-*.lock.yml; do
-  [ -f "$lockfile" ] || continue
-
-  # Derive workflow name from filename: gh-aw-foo-bar.lock.yml → foo-bar
-  basename=$(basename "$lockfile")
-  name="${basename#gh-aw-}"
-  name="${name%.lock.yml}"
-
-  outfile="$OUTPUT_DIR/${name}.prompt.md"
-
-  # State machine to extract prompt content from the "Create prompt" step.
-  # States: 0=scanning, 1=in prompt block, 2=in heredoc content
-  awk '
-    BEGIN { state = 0 }
-
-    # Find the opening brace of the prompt assembly block
-    state == 0 && /^[ \t]*\{$/ && saw_create_prompt {
-      state = 1
-      next
-    }
-
-    # Track that we have seen the "Create prompt" step
-    /Create prompt with built-in context/ {
-      saw_create_prompt = 1
-      next
-    }
-
-    # End of prompt block
-    state >= 1 && /\} > "\$GH_AW_PROMPT"/ {
-      exit
-    }
-
-    # Runtime file include → placeholder
-    state == 1 && /cat "\/opt\/gh-aw\/prompts\// {
-      # Extract filename: cat "/opt/gh-aw/prompts/foo.md" → foo.md
-      s = $0
-      sub(/.*cat "\/opt\/gh-aw\/prompts\//, "", s)
-      sub(/".*/, "", s)
-      if (s != "") {
-        print "<!-- [RUNTIME INCLUDE: " s "] -->"
-        print ""
-      }
-      next
-    }
-
-    # Start of heredoc block
-    state == 1 && /cat << .GH_AW_PROMPT_EOF./ {
-      state = 2
-      next
-    }
-
-    # End of heredoc block
-    state == 2 && /^[ \t]*GH_AW_PROMPT_EOF[ \t]*$/ {
-      state = 1
-      next
-    }
-
-    # Content inside heredoc — strip leading whitespace (lockfile indents with 10 spaces)
-    state == 2 {
-      sub(/^          /, "")
-      print
-    }
-  ' "$lockfile" > "$outfile"
-
-  # Skip empty extractions (backwards-compat wrapper files, etc.)
-  if [ ! -s "$outfile" ]; then
-    rm -f "$outfile"
-    continue
-  fi
-
-  count=$((count + 1))
-done
-
-# Write a manifest listing all extracted files with line counts
-{
-  echo "# Prompt Audit Manifest"
-  echo ""
-  echo "Extracted prompt text from $count lockfiles in \`$INPUT_DIR/\`."
-  echo ""
-  echo "| Workflow | Lines | File |"
-  echo "| --- | --- | --- |"
-  for f in "$OUTPUT_DIR"/*.prompt.md; do
-    [ -f "$f" ] || continue
-    base=$(basename "$f" .prompt.md)
-    lines=$(wc -l < "$f" | tr -d ' ')
-    echo "| $base | $lines | \`$f\` |"
-  done
-} > "$OUTPUT_DIR/README.md"
-
-echo "Extracted prompts from $count lockfiles → $OUTPUT_DIR/"
-ls -la "$OUTPUT_DIR/"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute directory containing this script
+exec python3 "$SCRIPT_DIR/extract_lockfile_prompts.py" "$@"  # replace this shell; pass all arguments through
diff --git a/scripts/extract_lockfile_prompts.py b/scripts/extract_lockfile_prompts.py
new file mode 100644
index 00000000..dd0353c1
--- /dev/null
+++ b/scripts/extract_lockfile_prompts.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""Extract compiled prompts from gh-aw lockfiles using YAML parsing."""
+
+from __future__ import annotations
+
+import argparse
+import sys
+from pathlib import Path
+import re
+
+import yaml
+
+
+CREATE_PROMPT_STEP_NAME = "Create prompt with built-in context"  # substring looked for in each step's "name"
+RUNTIME_INCLUDE_RE = re.compile(  # a line that is only `cat <path>` where <path> contains /gh-aw/prompts/
+    r'^\s*cat\s+["\']?(?P<path>[^"\']*/gh-aw/prompts/[^"\']+)["\']?\s*$'
+)  # NOTE(review): with an unquoted path, trailing spaces are captured inside the "path" group
+HEREDOC_START_RE = re.compile(  # `cat << MARKER` / `cat <<-MARKER`, optionally quoted; captures MARKER
+    r'^\s*cat\s*<<-?\s*["\']?(?P<marker>[A-Za-z0-9_]+)["\']?\s*$'
+)
+
+
+def extract_prompts_from_run(run_script: str) -> str:
+    """Return heredoc bodies and runtime-include placeholders found in run_script ("" if none)."""
+    lines: list[str] = []  # output lines, in the order they appear in the script
+    heredoc_end_marker: str | None = None  # terminator of the open heredoc; None when outside one
+
+    for line in run_script.splitlines():
+        if heredoc_end_marker is not None:
+            if line.strip() == heredoc_end_marker:  # terminator is compared ignoring surrounding whitespace
+                heredoc_end_marker = None
+                continue
+            lines.append(line)  # heredoc body lines are kept verbatim
+            continue
+
+        include_match = RUNTIME_INCLUDE_RE.match(line)
+        if include_match:
+            include_name = Path(include_match.group("path")).name  # keep only the file name
+            lines.append(f"<!-- [RUNTIME INCLUDE: {include_name}] -->")
+            lines.append("")  # blank line after each placeholder
+            continue
+
+        heredoc_match = HEREDOC_START_RE.match(line)  # lines matching neither pattern are dropped
+        if heredoc_match:
+            heredoc_end_marker = heredoc_match.group("marker")
+            continue
+
+    extracted = "\n".join(lines).rstrip()  # drop trailing blank lines and whitespace
+    if not extracted:
+        return ""
+    return f"{extracted}\n"  # non-empty output ends with exactly one newline
+
+
+def extract_lockfile_prompt(lockfile_path: Path) -> str:
+    """Extract prompt text from the first step named like CREATE_PROMPT_STEP_NAME; "" if absent."""
+    with lockfile_path.open("r", encoding="utf-8") as lockfile:
+        data = yaml.safe_load(lockfile)  # no try/except here: parse errors propagate to the caller
+
+    if not isinstance(data, dict):  # top level is not a mapping
+        return ""
+
+    jobs = data.get("jobs")
+    if not isinstance(jobs, dict):
+        return ""
+
+    for job in jobs.values():
+        if not isinstance(job, dict):
+            continue
+        steps = job.get("steps")
+        if not isinstance(steps, list):
+            continue
+        for step in steps:
+            if not isinstance(step, dict):
+                continue
+            step_name = step.get("name")
+            if not isinstance(step_name, str):
+                continue
+            if CREATE_PROMPT_STEP_NAME not in step_name:  # substring match, not equality
+                continue
+            run_script = step.get("run")
+            if not isinstance(run_script, str):  # "run" missing or not a string: keep searching
+                continue
+            return extract_prompts_from_run(run_script)  # first match wins, even if it extracts ""
+
+    return ""
+
+
+def write_manifest(output_dir: Path, input_dir: Path, extracted_files: list[Path]) -> None:
+    lines = [  # Markdown manifest: heading, summary sentence, then the table header
+        "# Prompt Audit Manifest",
+        "",
+        f"Extracted prompt text from {len(extracted_files)} lockfiles in `{input_dir}/`.",
+        "",
+        "| Workflow | Lines | File |",
+        "| --- | --- | --- |",
+    ]
+
+    for prompt_file in extracted_files:  # one table row per extracted prompt file
+        workflow_name = prompt_file.name.removesuffix(".prompt.md")
+        line_count = len(prompt_file.read_text(encoding="utf-8").splitlines())  # counted by re-reading the file
+        lines.append(f"| {workflow_name} | {line_count} | `{prompt_file}` |")
+
+    (output_dir / "README.md").write_text("\n".join(lines) + "\n", encoding="utf-8")  # overwrites any existing README.md
+
+
+def parse_args(argv: list[str]) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Extract agent prompt text from gh-aw .lock.yml files."
+    )
+    parser.add_argument(
+        "input_dir",
+        nargs="?",  # optional positional; falls back to the default below
+        default=".github/workflows",
+        help="Directory containing gh-aw-*.lock.yml files.",
+    )
+    parser.add_argument(
+        "output_dir",
+        nargs="?",  # optional positional; falls back to the default below
+        default="/tmp/prompt-audit",
+        help="Directory where extracted *.prompt.md files will be written.",
+    )
+    return parser.parse_args(argv)  # argv is the argument list without the program name
+
+
+def main(argv: list[str]) -> int:  # CLI entry point; returns 0 when it completes
+    args = parse_args(argv)
+    input_dir = Path(args.input_dir)
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    extracted_files: list[Path] = []  # prompt files written during this run
+    for lockfile_path in sorted(input_dir.glob("gh-aw-*.lock.yml")):  # sorted, so processing order is by path
+        workflow_name = lockfile_path.name.removeprefix("gh-aw-").removesuffix(".lock.yml")
+        output_file = output_dir / f"{workflow_name}.prompt.md"
+        prompt_text = extract_lockfile_prompt(lockfile_path)
+
+        if prompt_text:
+            output_file.write_text(prompt_text, encoding="utf-8")
+            extracted_files.append(output_file)
+        elif output_file.exists():
+            output_file.unlink()  # nothing extracted: delete any pre-existing file at this path
+
+    write_manifest(output_dir, input_dir, extracted_files)
+
+    print(f"Extracted prompts from {len(extracted_files)} lockfiles -> {output_dir}/")
+    for path in sorted(output_dir.glob("*")):  # lists every entry in output_dir, including README.md
+        print(path)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))  # main()'s return value becomes the process exit status
diff --git a/tests/test_extract_lockfile_prompts.py b/tests/test_extract_lockfile_prompts.py
new file mode 100644
index 00000000..73e0425f
--- /dev/null
+++ b/tests/test_extract_lockfile_prompts.py
@@ -0,0 +1,105 @@
+import importlib.util
+import subprocess
+import sys
+from pathlib import Path
+
+import yaml
+
+
+SCRIPT_PATH = (  # extractor script, located relative to the directory above this test file's directory
+    Path(__file__).resolve().parent.parent / "scripts" / "extract_lockfile_prompts.py"
+)
+CURRENT_LOCKFILE = (  # lockfile under .github/workflows used as the current-format fixture
+    Path(__file__).resolve().parent.parent
+    / ".github"
+    / "workflows"
+    / "gh-aw-framework-best-practices.lock.yml"
+)
+
+
+def _load_module():
+    spec = importlib.util.spec_from_file_location("extract_lockfile_prompts", SCRIPT_PATH)  # import by file path
+    assert spec and spec.loader
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # runs the module body under the name given above
+    return module
+
+
+def _run_script(input_dir: Path, output_dir: Path) -> subprocess.CompletedProcess[str]:
+    return subprocess.run(
+        [sys.executable, str(SCRIPT_PATH), str(input_dir), str(output_dir)],  # same interpreter as the tests
+        capture_output=True,
+        text=True,
+        check=False,  # no exception on non-zero exit; callers assert on returncode
+    )
+
+
+def test_extract_prompts_from_run_supports_legacy_markers():
+    module = _load_module()  # call the function directly instead of going through the CLI
+    run_script = """
+{
+cat "/opt/gh-aw/prompts/base.md"
+cat << 'GH_AW_PROMPT_EOF'
+hello
+world
+GH_AW_PROMPT_EOF
+} > "$GH_AW_PROMPT"
+""".strip()  # legacy shape: /opt/gh-aw include path and an unhashed GH_AW_PROMPT_EOF marker
+
+    extracted = module.extract_prompts_from_run(run_script)
+
+    assert "<!-- [RUNTIME INCLUDE: base.md] -->" in extracted  # include becomes a placeholder
+    assert "hello" in extracted  # heredoc body is preserved
+    assert "world" in extracted
+
+
+def test_cli_extracts_current_lockfile_prompt_block(tmp_path):
+    input_dir = tmp_path / "input"
+    output_dir = tmp_path / "output"
+    input_dir.mkdir()
+    output_dir.mkdir()
+
+    lockfile_copy = input_dir / CURRENT_LOCKFILE.name  # copy the fixture so the CLI sees exactly one lockfile
+    lockfile_copy.write_text(CURRENT_LOCKFILE.read_text(encoding="utf-8"), encoding="utf-8")
+
+    result = _run_script(input_dir, output_dir)
+
+    assert result.returncode == 0
+    extracted_file = output_dir / "framework-best-practices.prompt.md"  # name without "gh-aw-" and ".lock.yml"
+    assert extracted_file.exists()
+
+    extracted = extracted_file.read_text(encoding="utf-8")
+    assert "<!-- [RUNTIME INCLUDE: xpia.md] -->" in extracted  # presumably a RUNNER_TEMP include in the fixture
+    assert "<safe-output-tools>" in extracted  # presumably heredoc content in the fixture -- TODO confirm
+    assert "</system>" in extracted
+
+    manifest = (output_dir / "README.md").read_text(encoding="utf-8")
+    assert "Extracted prompt text from 1 lockfiles" in manifest
+    assert "framework-best-practices" in manifest
+
+
+def test_cli_skips_lockfile_without_create_prompt_step(tmp_path):
+    input_dir = tmp_path / "input"
+    output_dir = tmp_path / "output"
+    input_dir.mkdir()
+    output_dir.mkdir()
+
+    lockfile_data = {  # a workflow whose only step is not the "Create prompt" step
+        "jobs": {
+            "run": {
+                "steps": [
+                    {"name": "Checkout", "run": "echo hi"},
+                ]
+            }
+        }
+    }
+    (input_dir / "gh-aw-empty.lock.yml").write_text(
+        yaml.safe_dump(lockfile_data), encoding="utf-8"
+    )
+
+    result = _run_script(input_dir, output_dir)
+
+    assert result.returncode == 0
+    assert not (output_dir / "empty.prompt.md").exists()  # no prompt file for an empty extraction
+    manifest = (output_dir / "README.md").read_text(encoding="utf-8")  # the manifest is still written
+    assert "Extracted prompt text from 0 lockfiles" in manifest