From 7d099e385de1bc0fc52bd5f4e58e2328fc898e86 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 11 Mar 2026 18:56:51 -0400 Subject: [PATCH] OCPCRT-455: Add splay configuration to stagger verification job start times Add a "splay" option to ReleaseConfig that spreads out verification job launches over a configurable random window. Each job gets a deterministic delay in [0, splay) based on an FNV32 hash of the payload tag and job name, reducing the blast radius of transient infrastructure issues like registry outages. The splay gate is applied in ensureProwJobForReleaseTag(), the single function through which all ProwJob creation flows. Aggregator jobs are held until all their analysis jobs have been created, preserving the existing ordering guarantee. The controller re-queues itself every 30s while jobs are still deferred. Example config: "splay": "15m" Co-Authored-By: Claude Opus 4.6 --- cmd/release-controller/splay_test.go | 64 ++++++++++++++++++++++ cmd/release-controller/sync_analysis.go | 15 +++-- cmd/release-controller/sync_upgrade.go | 2 + cmd/release-controller/sync_verify.go | 18 +++++- cmd/release-controller/sync_verify_prow.go | 20 +++++++ pkg/release-controller/types.go | 7 +++ 6 files changed, 121 insertions(+), 5 deletions(-) create mode 100644 cmd/release-controller/splay_test.go diff --git a/cmd/release-controller/splay_test.go b/cmd/release-controller/splay_test.go new file mode 100644 index 000000000..0a1687183 --- /dev/null +++ b/cmd/release-controller/splay_test.go @@ -0,0 +1,64 @@ +package main + +import ( + "hash/fnv" + "testing" + "time" +) + +// splayDelay mirrors the delay computation in ensureProwJobForReleaseTag +func splayDelay(tagName, prowJobName string, splay time.Duration) time.Duration { + h := fnv.New32a() + h.Write([]byte(tagName + "/" + prowJobName)) + return time.Duration(h.Sum32()%uint32(splay.Seconds())) * time.Second +} + +func TestSplayDelay(t *testing.T) { + splay := 15 * time.Minute + + t.Run("deterministic", func(t *testing.T) { + d1 := splayDelay("4.22.0-0.nightly-2026-03-11-124441", "job-a", splay) + d2 := splayDelay("4.22.0-0.nightly-2026-03-11-124441", "job-a", splay) + if d1 != d2 { + t.Errorf("same inputs produced different delays: %v vs %v", d1, d2) + } + }) + + t.Run("within bounds", func(t *testing.T) { + for i := range 100 { + tag := "4.22.0-0.nightly-2026-03-11-124441" + job := "job-" + string(rune('a'+i)) + d := splayDelay(tag, job, splay) + if d < 0 || d >= splay { + t.Errorf("delay %v out of range [0, %v) for job %s", d, splay, job) + } + } + }) + + t.Run("different jobs get different delays", func(t *testing.T) { + tag := "4.22.0-0.nightly-2026-03-11-124441" + delays := make(map[time.Duration]int) + for i := range 30 { + job := "aggregated-e2e-aws-analysis-" + string(rune('0'+i)) + d := splayDelay(tag, job, splay) + delays[d]++ + } + // With 30 jobs across 900 seconds, we should get reasonable distribution. + // At minimum, not all delays should be identical. + if len(delays) < 2 { + t.Errorf("expected varied delays across 30 jobs, got %d distinct values", len(delays)) + } + }) + + t.Run("different payloads get different delays for same job", func(t *testing.T) { + d1 := splayDelay("4.22.0-0.nightly-2026-03-11-124441", "e2e-aws", splay) + d2 := splayDelay("4.22.0-0.nightly-2026-03-12-060000", "e2e-aws", splay) + if d1 == d2 { + t.Logf("note: same delay for different payloads (possible but unlikely): %v", d1) + } + }) + + // Note: when splay is 0, ensureProwJobForReleaseTag skips the splay gate + // entirely (guarded by `if splay > 0`), so splayDelay is never called + // with a zero duration. +} diff --git a/cmd/release-controller/sync_analysis.go b/cmd/release-controller/sync_analysis.go index bf4184d1b..a7a9874a0 100644 --- a/cmd/release-controller/sync_analysis.go +++ b/cmd/release-controller/sync_analysis.go @@ -14,7 +14,10 @@ const ( defaultAggregateProwJobName = "release-openshift-release-analysis-aggregator" ) -func (c *Controller) launchAnalysisJobs(release *releasecontroller.Release, verifyName string, verifyType releasecontroller.ReleaseVerification, releaseTag *imagev1.TagReference, previousTag, previousReleasePullSpec string) error { +// launchAnalysisJobs creates analysis jobs for an aggregated verification step. +// It returns true if all analysis jobs have been created (or already exist), and +// false if some jobs were deferred by splay and still need to be created later. +func (c *Controller) launchAnalysisJobs(release *releasecontroller.Release, verifyName string, verifyType releasecontroller.ReleaseVerification, releaseTag *imagev1.TagReference, previousTag, previousReleasePullSpec string) (bool, error) { jobLabels := map[string]string{ releasecontroller.ProwJobLabelCapability: "rce", "release.openshift.io/analysis": releaseTag.Name, @@ -25,15 +28,19 @@ func (c *Controller) launchAnalysisJobs(release *releasecontroller.Release, veri copied := verifyType.DeepCopy() copied.AggregatedProwJob.AnalysisJobCount = 0 + allCreated := true for i := range verifyType.AggregatedProwJob.AnalysisJobCount { // Postfix the name to differentiate it from the aggregator job jobNameSuffix := fmt.Sprintf("analysis-%d", i) - _, err := c.ensureProwJobForReleaseTag(release, verifyName, jobNameSuffix, *copied, releaseTag, previousTag, previousReleasePullSpec, jobLabels) + job, err := c.ensureProwJobForReleaseTag(release, verifyName, jobNameSuffix, *copied, releaseTag, previousTag, previousReleasePullSpec, jobLabels) if err != nil { - return err + return false, err + } + if job == nil { + allCreated = false } } - return nil + return allCreated, nil } func addAnalysisEnvToProwJobSpec(spec *prowjobv1.ProwJobSpec, payloadTag, verificationJobName string) (bool, error) { diff --git a/cmd/release-controller/sync_upgrade.go b/cmd/release-controller/sync_upgrade.go index b4d26240b..abc8973cb 100644 --- a/cmd/release-controller/sync_upgrade.go +++ b/cmd/release-controller/sync_upgrade.go @@ -81,6 +81,8 @@ func (c *Controller) ensureReleaseUpgradeJobs(release *releasecontroller.Release if err != nil { return err } + // nil job with nil error means splay deferred the job; the controller + // will re-process and create it when the delay elapses. } return nil } diff --git a/cmd/release-controller/sync_verify.go b/cmd/release-controller/sync_verify.go index ac4429a7d..da4723f24 100644 --- a/cmd/release-controller/sync_verify.go +++ b/cmd/release-controller/sync_verify.go @@ -90,16 +90,32 @@ func (c *Controller) ensureVerificationJobs(release *releasecontroller.Release, releasecontroller.ReleaseLabelPayload: releaseTag.Name, } if verifyType.AggregatedProwJob != nil { - err := c.launchAnalysisJobs(release, name, verifyType, releaseTag, previousTag, previousReleasePullSpec) + allAnalysisCreated, err := c.launchAnalysisJobs(release, name, verifyType, releaseTag, previousTag, previousReleasePullSpec) if err != nil { return nil, err } + if !allAnalysisCreated { + // Some analysis jobs are still deferred by splay; don't + // launch the aggregator yet. Re-queue to check again. + klog.V(4).Infof("Splay: deferring aggregator for %s/%s until all analysis jobs are created", releaseTag.Name, name) + if retryQueueDelay == 0 || 30*time.Second < retryQueueDelay { + retryQueueDelay = 30 * time.Second + } + continue + } jobLabels["release.openshift.io/aggregator"] = releaseTag.Name } job, err := c.ensureProwJobForReleaseTag(release, name, jobNameSuffix, verifyType, releaseTag, previousTag, previousReleasePullSpec, jobLabels) if err != nil { return nil, err } + if job == nil { + // Job creation was deferred by splay; re-queue to check again + if retryQueueDelay == 0 || 30*time.Second < retryQueueDelay { + retryQueueDelay = 30 * time.Second + } + continue + } status, ok := releasecontroller.ProwJobVerificationStatus(job) if !ok { return nil, fmt.Errorf("unexpected error accessing prow job definition") diff --git a/cmd/release-controller/sync_verify_prow.go b/cmd/release-controller/sync_verify_prow.go index 150ad2ef0..6bdfb462b 100644 --- a/cmd/release-controller/sync_verify_prow.go +++ b/cmd/release-controller/sync_verify_prow.go @@ -4,8 +4,10 @@ import ( "bytes" "context" "fmt" + "hash/fnv" "strconv" "strings" + "time" "github.com/openshift/release-controller/pkg/prow" @@ -55,6 +57,24 @@ func (c *Controller) ensureProwJobForReleaseTag(release *releasecontroller.Relea return obj.(*unstructured.Unstructured), nil } + // If splay is configured, defer job creation by a deterministic random delay + // derived from the payload tag and job name. This spreads out job launches to + // reduce the blast radius of transient infrastructure issues. + splay := time.Duration(release.Config.Splay) + if splay > 0 { + tagCreated, parseErr := time.Parse(time.RFC3339, releaseTag.Annotations[releasecontroller.ReleaseAnnotationCreationTimestamp]) + if parseErr == nil { + h := fnv.New32a() + h.Write([]byte(releaseTag.Name + "/" + prowJobName)) + delay := time.Duration(h.Sum32()%uint32(splay.Seconds())) * time.Second + remaining := delay - time.Since(tagCreated) + if remaining > 0 { + klog.V(4).Infof("Splay: deferring job %s for %s (delay %s, tag created %s)", prowJobName, remaining.Truncate(time.Second), delay.Truncate(time.Second), tagCreated.Format(time.RFC3339)) + return nil, nil + } + } + } + // It is very important that you do not modify anything returned from c.prowConfigLoader.Config(): // https://github.com/kubernetes/test-infra/blob/a68e557705be5e4b7d3cf45bb4fe1e93b82c5682/prow/config/agent.go#L389 // These are in-memory copies of the mounted prow configurations and changing them results in subsequent lookups that diff --git a/pkg/release-controller/types.go b/pkg/release-controller/types.go index 771ff0828..ec7ab62fc 100644 --- a/pkg/release-controller/types.go +++ b/pkg/release-controller/types.go @@ -130,6 +130,13 @@ type ReleaseConfig struct { // should be expired and removed. If unset, tags are not expired. Expires utils.Duration `json:"expires"` + // Splay is the maximum random offset as a golang duration applied to verification + // job start times. Each job is assigned a deterministic random delay in [0, splay) + // based on the payload tag and job name. This spreads out job launches to reduce the + // blast radius of transient infrastructure issues like registry outages. If unset, + // all jobs start immediately. + Splay utils.Duration `json:"splay,omitempty"` + // Verify is a map of short names to verification steps that must succeed before the // release is Accepted. Failures for some job types will cause the release to be // rejected.