From 09b5d623a4aa8c5f7ca3208b5f18ff4e7839ae77 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Wed, 17 Jun 2026 10:04:49 -0400 Subject: [PATCH] fix: author hotfix resolution PR with trigger-capable state token The hotfix apply job opened the resolution PR with gh pr create under the job-level GH_TOKEN, which defaulted to GITHUB_TOKEN. A PR authored by github-actions[bot] does not fire on: pull_request workflows, so on a protected env branch the required status check could only post via on: workflow_run after the hotfix run finished. The apply job would not finish until the PR merged, the PR could not merge until the check posted, and the check could not post until the apply job finished: a deadlock. Set the apply job's job-level GH_TOKEN to the configured state token so the resolution PR is authored by a trigger-capable actor. on: pull_request then fires and the env-branch required check posts on PR open, independent of the apply job, while the poll-until-mergeable merge step still gates the merge on that check. When no state token is configured the token degrades to GITHUB_TOKEN; post-hotfix automation then requires a configured state_token, consistent with the merge step. Signed-off-by: Joshua Temple --- internal/generate/hotfix.go | 36 ++++++++++++++------- internal/generate/hotfix_test.go | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 11 deletions(-) diff --git a/internal/generate/hotfix.go b/internal/generate/hotfix.go index 3f63b37..38c6f47 100644 --- a/internal/generate/hotfix.go +++ b/internal/generate/hotfix.go @@ -253,11 +253,14 @@ func (g *HotfixGenerator) writePlanJob(sb *strings.Builder) { } // writeApplyJob emits the apply job, run on dispatch when not a dry-run. It -// cherry-picks the commit onto a hotfix branch and opens a resolution PR. A -// clean cherry-pick is merged by the dedicated merge step as the configured -// state token, which polls until the PR is mergeable so a protected env branch -// with a required check still gates the merge. A conflicting cherry-pick opens a -// labeled PR for local resolution and is merged by a human via the UI. +// cherry-picks the commit onto a hotfix branch and opens a resolution PR via gh +// pr create. The job-level GH_TOKEN is the configured state token so the PR is +// authored by a trigger-capable actor: this fires on: pull_request, which lets a +// protected env branch's required check post on PR open rather than only after +// this run finishes. A clean cherry-pick is then merged by the dedicated merge +// step (also as the state token), which polls until the PR is mergeable so the +// required check still gates the merge. A conflicting cherry-pick opens a labeled +// PR for local resolution and is merged by a human via the UI. func (g *HotfixGenerator) writeApplyJob(sb *strings.Builder) { sb.WriteString(" apply:\n") sb.WriteString(" name: Apply Hotfix Cherry-Pick\n") @@ -268,7 +271,18 @@ func (g *HotfixGenerator) writeApplyJob(sb *strings.Builder) { sb.WriteString(" if: github.event_name == 'workflow_dispatch' && github.event.inputs.dry_run != 'true' && needs.plan.outputs.no_op != 'true'\n") sb.WriteString(" runs-on: ubuntu-latest\n") sb.WriteString(" env:\n") - sb.WriteString(" GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}\n") + // Author the resolution PR with the configured state token so gh pr create + // runs as a trigger-capable actor. A PR opened under the default GITHUB_TOKEN + // is authored by github-actions[bot], and a bot-authored PR does not fire + // on: pull_request workflows; the env-branch required check would then post + // only via on: workflow_run after this run finishes, deadlocking against the + // merge step that waits for that check. A PAT-authored PR fires on: + // pull_request so the check posts on PR open, independent of this job. The + // merge step (writeCleanMergeStep) inherits this same job-level token. When + // no state token is configured this degrades to GITHUB_TOKEN, in which case + // post-hotfix automation (early check + finalize) requires the operator to + // supply a trigger-capable state_token, matching the merge step's caveat. + fmt.Fprintf(sb, " GH_TOKEN: %s\n", g.getStateTokenRef()) sb.WriteString(" COMMIT: ${{ github.event.inputs.commit }}\n") sb.WriteString(" TARGET_ENV: ${{ github.event.inputs.target_env }}\n") sb.WriteString(" BASE_SHA: ${{ needs.plan.outputs.base_sha }}\n") @@ -333,11 +347,11 @@ func (g *HotfixGenerator) writeApplyJob(sb *strings.Builder) { fmt.Fprintf(sb, " --label %s \\\n", hotfixLabel) sb.WriteString(" --title \"hotfix(${TARGET_ENV}): cherry-pick ${SHORT_SHA}\" \\\n") sb.WriteString(" --body \"$BODY\"\n") - // Hand the resolution branch to the dedicated merge step. The merge runs - // as the configured state token (a trigger-capable actor), which the - // job-level GH_TOKEN is not, so it has to be a separate step with its own - // env. The clean path is the only one that auto-merges; the conflict path - // leaves the merge to a human via the UI. + // Hand the resolution branch to the dedicated merge step. Both gh pr create + // above and the merge step run as the job-level GH_TOKEN (the configured + // state token), so the resolution PR is authored by a trigger-capable actor + // and the merge is too. The clean path is the only one that auto-merges; the + // conflict path leaves the merge to a human via the UI. sb.WriteString(" {\n") sb.WriteString(" echo \"HOTFIX_BRANCH=$BRANCH\"\n") sb.WriteString(" echo \"HOTFIX_CLEAN_MERGE=true\"\n") diff --git a/internal/generate/hotfix_test.go b/internal/generate/hotfix_test.go index d1fe6ae..4088d72 100644 --- a/internal/generate/hotfix_test.go +++ b/internal/generate/hotfix_test.go @@ -200,6 +200,61 @@ func TestHotfixGenerator_CleanPathMergeDefaultsToGitHubToken(t *testing.T) { assert.Contains(t, content, "GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}") } +// applyJobGHToken extracts the apply job's job-level GH_TOKEN expression from a +// generated hotfix workflow. It parses the workflow as YAML so the assertion +// targets the job-level env value rather than any step-level override, isolating +// the actor that authors the resolution PR via gh pr create. +func applyJobGHToken(t *testing.T, content string) string { + t.Helper() + var wf struct { + Jobs map[string]struct { + Env map[string]string `yaml:"env"` + } `yaml:"jobs"` + } + require.NoError(t, yaml.Unmarshal([]byte(content), &wf)) + apply, ok := wf.Jobs["apply"] + require.True(t, ok, "apply job must be present") + return apply.Env["GH_TOKEN"] +} + +// TestHotfixGenerator_ApplyCreatesPRWithStateToken guards the structural fix for +// the protected-env-branch deadlock. The apply job opens the resolution PR with +// gh pr create, which authenticates with the job-level GH_TOKEN. When that token +// is the default GITHUB_TOKEN the PR is authored by github-actions[bot], and a +// bot-authored PR does not trigger on: pull_request workflows. The env-branch +// required check then can only post via on: workflow_run after the hotfix run +// finishes, but the apply job will not finish until the PR merges, the PR cannot +// merge until the check posts, and the check cannot post until the apply job +// finishes: a deadlock. Authoring the PR with the trigger-capable state token +// fires on: pull_request so the required check posts on PR open, independent of +// the apply job, breaking the cycle. +func TestHotfixGenerator_ApplyCreatesPRWithStateToken(t *testing.T) { + cfg := threeEnvHotfixConfig() + cfg.StateToken = "${{ secrets.CASCADE_BOT_TOKEN }}" + gen := NewHotfixGenerator(cfg, "") + content, err := gen.Generate() + require.NoError(t, err) + + // The apply job's job-level GH_TOKEN, which gh pr create uses to author the + // resolution PR, must be the configured state token, not bare GITHUB_TOKEN. + assert.Equal(t, "${{ secrets.CASCADE_BOT_TOKEN }}", applyJobGHToken(t, content), + "the apply job must author the resolution PR with the trigger-capable state token so on: pull_request fires and the env-branch required check posts on PR open") +} + +// TestHotfixGenerator_ApplyTokenDefaultsToGitHubToken confirms back-compat: when +// no state token is configured the apply job's GH_TOKEN degrades to the default +// GITHUB_TOKEN expression, matching the token plumbing used elsewhere. Post-hotfix +// automation (the env-branch check firing on PR open and the finalize chain) +// requires a configured state_token, consistent with the merge step's caveat. +func TestHotfixGenerator_ApplyTokenDefaultsToGitHubToken(t *testing.T) { + gen := NewHotfixGenerator(threeEnvHotfixConfig(), "") + content, err := gen.Generate() + require.NoError(t, err) + + assert.Equal(t, "${{ secrets.GITHUB_TOKEN }}", applyJobGHToken(t, content), + "with no state token configured the apply job must fall back to GITHUB_TOKEN") +} + // TestHotfixGenerator_SeedsLabels guards the regression where the apply job ran // `gh pr create --label cascade-hotfix[-conflict]` without ever creating those // labels. `gh pr create --label X` hard-fails when label X does not exist, so