Skip to content

Commit 1998982

Browse files
committed
Merge branch 'slice-main' into evict_if_not_tolerated
2 parents bceef40 + 548d5a6 commit 1998982

2 files changed

Lines changed: 58 additions & 10 deletions

File tree

slice/internal/controller/workload_controller.go

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -186,8 +186,10 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
186186

187187
if ac.State == kueue.CheckStateReady && (len(grouped.deleted) > 0 || len(slices) != totalDesiredSlices(wl, nodes)) {
188188
log.V(3).Info("Slice has been deleted, evicting workload")
189-
err := r.evictWorkload(ctx, wl, ac, "Slice has been deleted")
190-
return ctrl.Result{}, err
189+
if err := r.evictWorkload(ctx, wl, ac, "Slice has been deleted"); err != nil {
190+
return ctrl.Result{}, err
191+
}
192+
return ctrl.Result{}, r.deleteSlicesForEvictedWorkload(ctx, grouped)
191193
}
192194

193195
if len(grouped.deleted) > 0 {
@@ -217,6 +219,10 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
217219
return ctrl.Result{}, client.IgnoreNotFound(err)
218220
}
219221

222+
if ac.State == kueue.CheckStateRetry {
223+
return ctrl.Result{}, r.deleteSlicesForEvictedWorkload(ctx, grouped)
224+
}
225+
220226
// Delete any Slices that are in a failed or stale state.
221227
if len(grouped.toDelete) > 0 {
222228
log.V(2).Info(
@@ -393,6 +399,20 @@ func (r *WorkloadReconciler) deleteSlices(ctx context.Context, slices []*v1beta1
393399
return nil
394400
}
395401

402+
func (r *WorkloadReconciler) deleteSlicesForEvictedWorkload(ctx context.Context, grouped groupedSlices) error {
403+
numSlicesToDelete := len(grouped.active) + len(grouped.initializing) + len(grouped.toDelete)
404+
if numSlicesToDelete == 0 {
405+
return nil
406+
}
407+
log := ctrl.LoggerFrom(ctx)
408+
toDelete := make([]*v1beta1.Slice, 0, numSlicesToDelete)
409+
toDelete = append(toDelete, grouped.active...)
410+
toDelete = append(toDelete, grouped.initializing...)
411+
toDelete = append(toDelete, grouped.toDelete...)
412+
log.V(3).Info("AdmissionCheck is Retry, deleting all Slices")
413+
return r.deleteSlices(ctx, toDelete)
414+
}
415+
396416
func (r *WorkloadReconciler) ownerPodsFinished(ctx context.Context, wl *kueue.Workload) (bool, error) {
397417
if isJobSetOwner(wl) {
398418
return r.jobSetPodsFinished(ctx, wl)
@@ -551,6 +571,11 @@ func (r *WorkloadReconciler) syncSlices(
551571
slices []v1beta1.Slice,
552572
nodes map[string]corev1.Node,
553573
) ([]v1beta1.Slice, error) {
574+
// this is to prevent from creating slices when AC is Retry
575+
// and the workload still has the old Admission
576+
if ac.State != kueue.CheckStatePending {
577+
return nil, nil
578+
}
554579
existingSlicesByName := make(map[string]*v1beta1.Slice, len(slices))
555580
for _, slice := range slices {
556581
existingSlicesByName[slice.Name] = &slice
@@ -729,7 +754,7 @@ func (r *WorkloadReconciler) validatePartitionCount(
729754

730755
func (r *WorkloadReconciler) evictWorkload(ctx context.Context, wl *kueue.Workload, ac *kueue.AdmissionCheckState, message string) error {
731756
ac.State = kueue.CheckStateRetry
732-
ac.RequeueAfterSeconds = ptr.To(int32(10))
757+
ac.RequeueAfterSeconds = ptr.To(int32(r.retryDelayOnSliceFailure.Round(time.Second).Seconds()))
733758
ac.Message = api.TruncateConditionMessage(message)
734759
return r.updateWorkloadAdmissionCheckStatus(ctx, wl, ac)
735760
}
@@ -881,6 +906,10 @@ func buildAdmissionCheckMessage(slicesByState map[core.SliceState][]v1beta1.Slic
881906
}
882907

883908
func (r *WorkloadReconciler) prepareAdmissionCheckStatus(wl *kueue.Workload, ac *kueue.AdmissionCheckState, slices []v1beta1.Slice) {
909+
// wait for Kueue to reset check to Pending after eviction
910+
if ac.State == kueue.CheckStateRetry {
911+
return
912+
}
884913
slicesByState := groupSlicesByState(slices, r.activationTimeout)
885914

886915
podSetRequiresHealthy := make(map[string]bool)

slice/internal/controller/workload_controller_test.go

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -738,6 +738,32 @@ func TestWorkloadReconciler(t *testing.T) {
738738
Obj(),
739739
},
740740
},
741+
"should not create Slices (and delete existing) if AdmissionCheck is in Retry state": {
742+
request: baseRequest,
743+
objs: []client.Object{
744+
worker1Node.DeepCopy(),
745+
worker2Node.DeepCopy(),
746+
baseAdmissionCheckWrapper.DeepCopy(),
747+
baseWorkloadWrapper.Clone().
748+
PodSets(basePodSets...).
749+
ReserveQuota(baseAdmission, now).
750+
ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName).
751+
Finalizers(SliceControllerName).
752+
AdmissionCheck(buildAdmissionCheckStateWithRequeue(kueue.CheckStateRetry, "retrying", ptr.To(int32(10)))).
753+
Obj(),
754+
baseSlice1Wrapper.DeepCopy(),
755+
baseSlice2Wrapper.DeepCopy(),
756+
},
757+
wantWorkloads: []kueue.Workload{
758+
*baseWorkloadWrapper.Clone().
759+
PodSets(basePodSets...).
760+
ReserveQuota(baseAdmission, now).
761+
ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName).
762+
Finalizers(SliceControllerName).
763+
AdmissionCheck(buildAdmissionCheckStateWithRequeue(kueue.CheckStateRetry, "retrying", ptr.To(int32(10)))).
764+
Obj(),
765+
},
766+
},
741767
"shouldn't create a Slice because there’s no AdmissionCheck": {
742768
request: baseRequest,
743769
objs: []client.Object{
@@ -1680,9 +1706,6 @@ func TestWorkloadReconciler(t *testing.T) {
16801706
`Slices are in states: 1 ACTIVE, 1 FAILED. Errors: Error by test`, ptr.To(int32(10)))).
16811707
Obj(),
16821708
},
1683-
wantSlices: []slice.Slice{
1684-
*baseSlice1Wrapper.Clone().Active().Obj(),
1685-
},
16861709
wantEvents: []utiltesting.EventRecord{
16871710
buildEventRecord(corev1.NamespaceDefault, corev1.EventTypeNormal, AdmissionCheckUpdatedEventType,
16881711
fmt.Sprintf(`Admission check %q updated state from "Pending" to "Retry"`, baseACName)),
@@ -1741,9 +1764,6 @@ func TestWorkloadReconciler(t *testing.T) {
17411764
"Slice has been deleted", ptr.To(int32(10)))).
17421765
Obj(),
17431766
},
1744-
wantSlices: []slice.Slice{
1745-
*baseSlice2Wrapper.Clone().Active().Obj(),
1746-
},
17471767
},
17481768
"should evict workload if slice is deleted unexpectedly": {
17491769
request: baseRequest,
@@ -1773,7 +1793,6 @@ func TestWorkloadReconciler(t *testing.T) {
17731793
},
17741794
wantSlices: []slice.Slice{
17751795
*baseSlice1Wrapper.Clone().Active().DeletionTimestamp(now).Finalizers("accelerator.gke.io/slice-finalizer").Obj(),
1776-
*baseSlice2Wrapper.Clone().Active().Obj(),
17771796
},
17781797
},
17791798
"should use the first AdmissionCheck if more than one is found": {

0 commit comments

Comments
 (0)