diff --git a/cmd/release-controller/splay_test.go b/cmd/release-controller/splay_test.go
new file mode 100644
index 000000000..0a1687183
--- /dev/null
+++ b/cmd/release-controller/splay_test.go
@@ -0,0 +1,65 @@
+package main
+
+import (
+	"fmt"
+	"hash/fnv"
+	"testing"
+	"time"
+)
+
+// splayDelay mirrors the delay computation in ensureProwJobForReleaseTag.
+func splayDelay(tagName, prowJobName string, splay time.Duration) time.Duration {
+	h := fnv.New32a()
+	h.Write([]byte(tagName + "/" + prowJobName))
+	return time.Duration(h.Sum32()%uint32(splay.Seconds())) * time.Second
+}
+
+func TestSplayDelay(t *testing.T) {
+	splay := 15 * time.Minute
+
+	t.Run("deterministic", func(t *testing.T) {
+		d1 := splayDelay("4.22.0-0.nightly-2026-03-11-124441", "job-a", splay)
+		d2 := splayDelay("4.22.0-0.nightly-2026-03-11-124441", "job-a", splay)
+		if d1 != d2 {
+			t.Errorf("same inputs produced different delays: %v vs %v", d1, d2)
+		}
+	})
+
+	t.Run("within bounds", func(t *testing.T) {
+		for i := range 100 {
+			tag := "4.22.0-0.nightly-2026-03-11-124441"
+			job := fmt.Sprintf("job-%d", i)
+			d := splayDelay(tag, job, splay)
+			if d < 0 || d >= splay {
+				t.Errorf("delay %v out of range [0, %v) for job %s", d, splay, job)
+			}
+		}
+	})
+
+	t.Run("different jobs get different delays", func(t *testing.T) {
+		tag := "4.22.0-0.nightly-2026-03-11-124441"
+		delays := make(map[time.Duration]int)
+		for i := range 30 {
+			job := fmt.Sprintf("aggregated-e2e-aws-analysis-%d", i)
+			d := splayDelay(tag, job, splay)
+			delays[d]++
+		}
+		// With 30 jobs spread across 900 seconds, we expect a reasonable
+		// distribution. At minimum, not all delays should be identical.
+		if len(delays) < 2 {
+			t.Errorf("expected varied delays across 30 jobs, got %d distinct values", len(delays))
+		}
+	})
+
+	t.Run("different payloads get different delays for same job", func(t *testing.T) {
+		d1 := splayDelay("4.22.0-0.nightly-2026-03-11-124441", "e2e-aws", splay)
+		d2 := splayDelay("4.22.0-0.nightly-2026-03-12-060000", "e2e-aws", splay)
+		if d1 == d2 {
+			t.Logf("note: same delay for different payloads (possible but unlikely): %v", d1)
+		}
+	})
+
+	// Note: ensureProwJobForReleaseTag only applies the splay gate when splay
+	// is at least one second (guarded by `if splay >= time.Second`), so
+	// splayDelay is never called with a modulus of zero seconds.
+}
diff --git a/cmd/release-controller/sync_analysis.go b/cmd/release-controller/sync_analysis.go
index bf4184d1b..a7a9874a0 100644
--- a/cmd/release-controller/sync_analysis.go
+++ b/cmd/release-controller/sync_analysis.go
@@ -14,7 +14,12 @@ const (
 	defaultAggregateProwJobName = "release-openshift-release-analysis-aggregator"
 )
 
-func (c *Controller) launchAnalysisJobs(release *releasecontroller.Release, verifyName string, verifyType releasecontroller.ReleaseVerification, releaseTag *imagev1.TagReference, previousTag, previousReleasePullSpec string) error {
+// launchAnalysisJobs creates analysis jobs for an aggregated verification step.
+// It returns true if all analysis jobs have been created (or already exist), and
+// false if some jobs were deferred by splay and still need to be created later.
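+// A nil job with a nil error from ensureProwJobForReleaseTag indicates a
+// launch that splay deferred; such jobs count as not yet created.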
+func (c *Controller) launchAnalysisJobs(release *releasecontroller.Release, verifyName string, verifyType releasecontroller.ReleaseVerification, releaseTag *imagev1.TagReference, previousTag, previousReleasePullSpec string) (bool, error) {
 	jobLabels := map[string]string{
 		releasecontroller.ProwJobLabelCapability: "rce",
 		"release.openshift.io/analysis":          releaseTag.Name,
@@ -25,15 +30,19 @@ func (c *Controller) launchAnalysisJobs(release *releasecontroller.Release, veri
 	copied := verifyType.DeepCopy()
 	copied.AggregatedProwJob.AnalysisJobCount = 0
 
+	allCreated := true
 	for i := range verifyType.AggregatedProwJob.AnalysisJobCount {
 		// Postfix the name to differentiate it from the aggregator job
 		jobNameSuffix := fmt.Sprintf("analysis-%d", i)
-		_, err := c.ensureProwJobForReleaseTag(release, verifyName, jobNameSuffix, *copied, releaseTag, previousTag, previousReleasePullSpec, jobLabels)
+		job, err := c.ensureProwJobForReleaseTag(release, verifyName, jobNameSuffix, *copied, releaseTag, previousTag, previousReleasePullSpec, jobLabels)
 		if err != nil {
-			return err
+			return false, err
+		}
+		if job == nil {
+			allCreated = false
 		}
 	}
-	return nil
+	return allCreated, nil
 }
 
 func addAnalysisEnvToProwJobSpec(spec *prowjobv1.ProwJobSpec, payloadTag, verificationJobName string) (bool, error) {
diff --git a/cmd/release-controller/sync_upgrade.go b/cmd/release-controller/sync_upgrade.go
index b4d26240b..abc8973cb 100644
--- a/cmd/release-controller/sync_upgrade.go
+++ b/cmd/release-controller/sync_upgrade.go
@@ -81,6 +81,8 @@ func (c *Controller) ensureReleaseUpgradeJobs(release *releasecontroller.Release
 		if err != nil {
 			return err
 		}
+		// A nil job with a nil error means splay deferred the job; the controller
+		// will re-process this tag and create the job once the delay elapses.
 	}
 	return nil
 }
diff --git a/cmd/release-controller/sync_verify.go b/cmd/release-controller/sync_verify.go
index ac4429a7d..da4723f24 100644
--- a/cmd/release-controller/sync_verify.go
+++ b/cmd/release-controller/sync_verify.go
@@ -90,16 +90,34 @@ func (c *Controller) ensureVerificationJobs(release *releasecontroller.Release,
 			releasecontroller.ReleaseLabelPayload: releaseTag.Name,
 		}
 		if verifyType.AggregatedProwJob != nil {
-			err := c.launchAnalysisJobs(release, name, verifyType, releaseTag, previousTag, previousReleasePullSpec)
+			allAnalysisCreated, err := c.launchAnalysisJobs(release, name, verifyType, releaseTag, previousTag, previousReleasePullSpec)
 			if err != nil {
 				return nil, err
 			}
+			if !allAnalysisCreated {
+				// Some analysis jobs are still deferred by splay; don't
+				// launch the aggregator yet. Re-queue to check again.
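+				// A retryQueueDelay of zero means no retry is scheduled yet;
+				// clamp it to at most 30 seconds so the recheck happens promptly.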
+				klog.V(4).Infof("Splay: deferring aggregator for %s/%s until all analysis jobs are created", releaseTag.Name, name)
+				if retryQueueDelay == 0 || 30*time.Second < retryQueueDelay {
+					retryQueueDelay = 30 * time.Second
+				}
+				continue
+			}
 			jobLabels["release.openshift.io/aggregator"] = releaseTag.Name
 		}
 		job, err := c.ensureProwJobForReleaseTag(release, name, jobNameSuffix, verifyType, releaseTag, previousTag, previousReleasePullSpec, jobLabels)
 		if err != nil {
 			return nil, err
 		}
+		if job == nil {
+			// Job creation was deferred by splay; re-queue to check again.
+			if retryQueueDelay == 0 || 30*time.Second < retryQueueDelay {
+				retryQueueDelay = 30 * time.Second
+			}
+			continue
+		}
 		status, ok := releasecontroller.ProwJobVerificationStatus(job)
 		if !ok {
 			return nil, fmt.Errorf("unexpected error accessing prow job definition")
diff --git a/cmd/release-controller/sync_verify_prow.go b/cmd/release-controller/sync_verify_prow.go
index 150ad2ef0..6bdfb462b 100644
--- a/cmd/release-controller/sync_verify_prow.go
+++ b/cmd/release-controller/sync_verify_prow.go
@@ -4,8 +4,10 @@ import (
 	"bytes"
 	"context"
 	"fmt"
+	"hash/fnv"
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/openshift/release-controller/pkg/prow"
 
@@ -55,6 +57,27 @@ func (c *Controller) ensureProwJobForReleaseTag(release *releasecontroller.Relea
 		return obj.(*unstructured.Unstructured), nil
 	}
 
+	// If splay is configured, defer job creation by a deterministic pseudo-random
+	// delay derived from the payload tag and job name. This spreads out job
+	// launches to reduce the blast radius of transient infrastructure issues.
+	// Sub-second values are treated as disabled: the modulus below is whole seconds.
+	splay := time.Duration(release.Config.Splay)
+	if splay >= time.Second {
+		// A missing or malformed creation timestamp disables splay for this
+		// tag, and the job is created immediately.
+		tagCreated, parseErr := time.Parse(time.RFC3339, releaseTag.Annotations[releasecontroller.ReleaseAnnotationCreationTimestamp])
+		if parseErr == nil {
+			h := fnv.New32a()
+			h.Write([]byte(releaseTag.Name + "/" + prowJobName))
+			delay := time.Duration(h.Sum32()%uint32(splay.Seconds())) * time.Second
+			remaining := delay - time.Since(tagCreated)
+			if remaining > 0 {
+				klog.V(4).Infof("Splay: deferring job %s for %s (delay %s, tag created %s)", prowJobName, remaining.Truncate(time.Second), delay.Truncate(time.Second), tagCreated.Format(time.RFC3339))
+				return nil, nil
+			}
+		}
+	}
+
 	// It is very important that you do not modify anything returned from c.prowConfigLoader.Config():
 	// https://github.com/kubernetes/test-infra/blob/a68e557705be5e4b7d3cf45bb4fe1e93b82c5682/prow/config/agent.go#L389
 	// These are in-memory copies of the mounted prow configurations and changing them results in subsequent lookups that
diff --git a/pkg/release-controller/types.go b/pkg/release-controller/types.go
index 771ff0828..ec7ab62fc 100644
--- a/pkg/release-controller/types.go
+++ b/pkg/release-controller/types.go
@@ -130,6 +130,13 @@ type ReleaseConfig struct {
 	// should be expired and removed. If unset, tags are not expired.
 	Expires utils.Duration `json:"expires"`
 
+	// Splay is the maximum random offset, as a Go duration, applied to verification
+	// job start times. Each job is assigned a deterministic pseudo-random delay in
+	// [0, splay) based on the payload tag and job name. This spreads out job launches
+	// to reduce the blast radius of transient infrastructure issues like registry
+	// outages. If unset or less than one second, all jobs start immediately.
+	Splay utils.Duration `json:"splay,omitempty"`
+
 	// Verify is a map of short names to verification steps that must succeed before the
 	// release is Accepted. Failures for some job types will cause the release to be
 	// rejected.
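
Illustration (not part of the patch): a minimal standalone sketch of the delay
assignment above, for eyeballing how launches spread for a single tag. The tag
and job names are made up, and the 15-minute value stands in for a configured
splay such as "15m" (assuming utils.Duration accepts Go duration strings, as
the field comment indicates); the computation mirrors ensureProwJobForReleaseTag.

	package main

	import (
		"fmt"
		"hash/fnv"
		"time"
	)

	func main() {
		splay := 15 * time.Minute
		tag := "4.22.0-0.nightly-2026-03-11-124441"
		for _, job := range []string{"e2e-aws", "e2e-gcp", "e2e-metal-ipi"} {
			h := fnv.New32a()
			h.Write([]byte(tag + "/" + job))
			// Deterministic delay in [0, splay), whole-second granularity.
			delay := time.Duration(h.Sum32()%uint32(splay.Seconds())) * time.Second
			fmt.Printf("%-13s starts %s after tag creation\n", job, delay)
		}
	}

Because the hash covers both the tag and the job name, the same job lands at a
different offset on each new payload, so no single job is always the last to start.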