From aab53a2d09130a44be554fcd749fb1a8914c645f Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Wed, 17 Jun 2026 14:08:39 -0400 Subject: [PATCH] fix: scope external-update concurrency per component The external-update workflow shared orchestrate's ref-scoped concurrency group (orchestrate-${{ github.ref }}). When two upstream components notify the same downstream repo on the same ref, they landed in one group. GitHub keeps only the latest pending run per group and cancels the older pending ones even with cancel-in-progress: false, so one component's state write was silently dropped. Scope the default group per component and per ref (cascade-external-${{ inputs.deploy_name }}-${{ github.ref }}). Distinct components now run concurrently; a component still serializes against itself. Cross-writer manifest contention is resolved by commitWithApplicationRetry's fetch/reset/re-apply on a non-fast-forward push. Explicit concurrency.group overrides are still forwarded verbatim, and the state-token checkout and stderr-capture behavior are unchanged. Signed-off-by: Joshua Temple --- internal/generate/external.go | 44 ++++++++++++---- internal/generate/external_test.go | 85 +++++++++++++++++++++++------- 2 files changed, 100 insertions(+), 29 deletions(-) diff --git a/internal/generate/external.go b/internal/generate/external.go index d0dfedd..f96468f 100644 --- a/internal/generate/external.go +++ b/internal/generate/external.go @@ -177,20 +177,42 @@ func (g *ExternalUpdateGenerator) writeJob(sb *strings.Builder) { } // writeConcurrency emits a top-level concurrency: block on the external-update -// workflow. The default group is the same ref-scoped key orchestrate uses -// ("orchestrate-${{ github.ref }}"), so external and internal state writes -// serialize on a shared non-cancelling queue and never race on the manifest -// push. cancel-in-progress is always false by default: dropping a mid-flight -// write leaves the manifest inconsistent, and a live orchestrate pipeline must -// never be cancelled by an incoming external notification. +// workflow. The default group is scoped PER COMPONENT and per ref +// ("cascade-external-${{ inputs.deploy_name }}-${{ github.ref }}"). Each +// upstream component (distinct deploy_name) lands in its own queue, so two +// components notifying the same downstream repo at the same time both run to +// completion instead of one cancelling the other. A given component still +// serializes against itself: GitHub keeps only the latest pending run per +// group, so a burst from one component collapses to the newest, which is the +// intended last-writer-wins for that single slot. // -// When the manifest explicitly sets concurrency.group, that value is forwarded -// as-is (via GetConcurrencyGroup) so operators can override the group if needed. -// cancel-in-progress follows the explicit config value when set, and defaults to -// false otherwise. +// Cross-component contention on the shared manifest file, and contention with +// the orchestrate state-writer, is resolved by commitWithApplicationRetry in +// the external update verb (fetch / reset / re-apply on a non-fast-forward +// push), not by a shared concurrency group. A single shared group would +// serialize distinct components and let GitHub's "keep only the latest pending +// run" rule drop all but one component's write, which is the regression this +// per-component key fixes. +// +// inputs.deploy_name is a workflow-level expression here, not shell text, so it +// is inert in the concurrency key and carries none of the run-body injection +// risk that the env-bound inputs guard against. +// +// cancel-in-progress is always false by default: dropping a mid-flight write +// leaves the manifest inconsistent. When the manifest explicitly sets +// concurrency.group, that value is forwarded as-is (via GetConcurrencyGroup) +// so operators can override the group, and cancel-in-progress follows the +// explicit config value. func (g *ExternalUpdateGenerator) writeConcurrency(sb *strings.Builder) { sb.WriteString("concurrency:\n") - fmt.Fprintf(sb, " group: %s\n", g.config.GetConcurrencyGroup()) + if g.config.Concurrency != nil && g.config.Concurrency.Group != "" { + // Operator override: forward the explicit group verbatim. + fmt.Fprintf(sb, " group: %s\n", g.config.Concurrency.Group) + } else { + // Default: per-component, per-ref group so distinct upstream components + // run concurrently against the same downstream repo. + sb.WriteString(" group: cascade-external-${{ inputs.deploy_name }}-${{ github.ref }}\n") + } if g.config.Concurrency != nil { fmt.Fprintf(sb, " cancel-in-progress: %t\n", g.config.Concurrency.CancelInProgress) } else { diff --git a/internal/generate/external_test.go b/internal/generate/external_test.go index f06de73..272c19f 100644 --- a/internal/generate/external_test.go +++ b/internal/generate/external_test.go @@ -563,11 +563,15 @@ func TestExternalUpdateGenerator_NotPrimary(t *testing.T) { } // TestExternalUpdateGenerator_HasConcurrencyBlock asserts the generated -// external-update workflow declares a top-level concurrency: block that shares -// orchestrate's ref-scoped group ("orchestrate-${{ github.ref }}"). This -// serializes external and internal state writes on the same non-cancelling -// queue so they never race on the manifest push (#31). cancel-in-progress is -// false because a dropped mid-flight write leaves the manifest inconsistent. +// external-update workflow declares a top-level concurrency: block scoped per +// component and per ref ("cascade-external-${{ inputs.deploy_name }}-${{ github.ref }}"). +// Distinct upstream components land in distinct groups so both run when they +// notify the same downstream repo at the same time; a component still +// serializes against itself. The group must NOT be orchestrate's shared +// ref-scoped key, which would let GitHub cancel all-but-the-latest pending run +// and silently drop a component's state write. cancel-in-progress is false +// because a dropped mid-flight write leaves the manifest inconsistent; +// cross-writer manifest contention is handled by commitWithApplicationRetry. func TestExternalUpdateGenerator_HasConcurrencyBlock(t *testing.T) { cfg := &config.TrunkConfig{ TrunkBranch: "master", @@ -589,13 +593,54 @@ func TestExternalUpdateGenerator_HasConcurrencyBlock(t *testing.T) { assert.Contains(t, content, "\nconcurrency:\n", "external-update workflow must declare top-level concurrency:") group := concurrencyGroupLine(t, content) - assert.Equal(t, " group: orchestrate-${{ github.ref }}", group, - "external-update default concurrency group must share orchestrate's ref-scoped queue to serialize state writes") - assert.NotContains(t, group, "inputs.source_repo", "external-update group must NOT scope by source_repo: all runs push the same manifest") - assert.NotContains(t, group, "inputs.environment", "external-update group must NOT scope by environment: all runs push the same manifest") + assert.Equal(t, " group: cascade-external-${{ inputs.deploy_name }}-${{ github.ref }}", group, + "external-update default concurrency group must be scoped per component so distinct components run concurrently") + assert.NotContains(t, group, "orchestrate-${{ github.ref }}", + "external-update group must NOT share orchestrate's shared group: that cancels concurrent component updates") + assert.Contains(t, group, "inputs.deploy_name", "external-update group must scope by deploy_name so distinct components do not collide") + assert.NotContains(t, group, "inputs.source_repo", "external-update group must NOT scope by source_repo") + assert.NotContains(t, group, "inputs.environment", "external-update group must NOT scope by environment") assert.Contains(t, content, "cancel-in-progress: false", "external-update default must queue, not cancel") } +// TestExternalUpdateGenerator_DistinctComponentsGetDistinctGroups asserts that +// two different components (distinct deploy_name) end up in DIFFERENT +// concurrency groups, so artifact-a and artifact-b both run to completion when +// they notify the same downstream repo concurrently. The group key is the same +// expression text for every run; what differs at runtime is the deploy_name +// input GitHub substitutes. We assert the key carries inputs.deploy_name (and +// is not the shared orchestrate group) because that is what makes the runtime +// groups distinct. +func TestExternalUpdateGenerator_DistinctComponentsGetDistinctGroups(t *testing.T) { + cfg := &config.TrunkConfig{ + TrunkBranch: "main", + Environments: []string{"dev", "prod"}, + External: []config.ExternalRepoConfig{ + { + Repo: "example/cdk-infra", + Ref: "main", + Deploys: []config.ExternalDeployConfig{ + {Name: "artifact-a", Workflow: "example/cdk-infra/.github/workflows/deploy.yaml"}, + {Name: "artifact-b", Workflow: "example/cdk-infra/.github/workflows/deploy.yaml"}, + }, + }, + }, + } + + gen := NewExternalUpdateGenerator(cfg, "/tmp") + content, err := gen.Generate() + require.NoError(t, err) + + group := concurrencyGroupLine(t, content) + // The generated key parameterizes on inputs.deploy_name, so at runtime + // artifact-a -> cascade-external-artifact-a- and + // artifact-b -> cascade-external-artifact-b-: distinct groups, both run. + assert.Equal(t, " group: cascade-external-${{ inputs.deploy_name }}-${{ github.ref }}", group, + "the concurrency group must parameterize on inputs.deploy_name so distinct components resolve to distinct groups") + assert.NotEqual(t, " group: orchestrate-${{ github.ref }}", group, + "the shared orchestrate group would put all components in one queue and cancel all but the latest pending run") +} + // TestExternalUpdateGenerator_InputsAreNotInterpolatedIntoRun asserts that the // generated "Update External State" step never substitutes workflow_dispatch // inputs directly into the shell script text. GitHub Actions expands ${{ ... }} @@ -743,12 +788,14 @@ func TestExternalUpdateGenerator_CheckoutUsesStateToken(t *testing.T) { }) } -// TestExternalUpdateGenerator_DefaultConcurrencySharesOrchestrateGroup asserts -// that, with no explicit concurrency config, the external-update workflow uses -// the same ref-scoped group as orchestrate ("orchestrate-${{ github.ref }}") -// so external and internal state writes serialize on a shared non-cancelling -// queue. cancel-in-progress must remain false to never drop a live pipeline. -func TestExternalUpdateGenerator_DefaultConcurrencySharesOrchestrateGroup(t *testing.T) { +// TestExternalUpdateGenerator_DefaultConcurrencyIsPerComponent asserts that, +// with no explicit concurrency config, the external-update workflow uses a +// per-component, per-ref group ("cascade-external-${{ inputs.deploy_name }}-${{ github.ref }}") +// rather than orchestrate's shared group. The per-component key lets distinct +// upstream components run concurrently against the same downstream repo; +// commitWithApplicationRetry serializes the actual manifest push. +// cancel-in-progress must remain false to never drop a live state write. +func TestExternalUpdateGenerator_DefaultConcurrencyIsPerComponent(t *testing.T) { cfg := &config.TrunkConfig{ TrunkBranch: "main", Environments: []string{"dev", "prod"}, @@ -769,10 +816,12 @@ func TestExternalUpdateGenerator_DefaultConcurrencySharesOrchestrateGroup(t *tes assert.Contains(t, content, "\nconcurrency:\n", "external-update workflow must declare top-level concurrency:") group := concurrencyGroupLine(t, content) - assert.Equal(t, " group: orchestrate-${{ github.ref }}", group, - "external-update default concurrency group must share orchestrate's ref-scoped group to serialize state writes") + assert.Equal(t, " group: cascade-external-${{ inputs.deploy_name }}-${{ github.ref }}", group, + "external-update default concurrency group must be per-component so distinct components do not cancel each other") + assert.NotEqual(t, " group: orchestrate-${{ github.ref }}", group, + "external-update must NOT reuse orchestrate's shared group: that is the cancellation regression") assert.Contains(t, content, "cancel-in-progress: false", - "external-update must never cancel a live pipeline; cancel-in-progress must be false") + "external-update must never cancel a live state write; cancel-in-progress must be false") } // TestNotifyPrimaryStep_BuildOnlySatellite_EmitsDeployName verifies that a