@@ -186,8 +186,10 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
 
     if ac.State == kueue.CheckStateReady && (len(grouped.deleted) > 0 || len(slices) != totalDesiredSlices(wl, nodes)) {
         log.V(3).Info("Slice has been deleted, evicting workload")
-        err := r.evictWorkload(ctx, wl, ac, "Slice has been deleted")
-        return ctrl.Result{}, err
+        if err := r.evictWorkload(ctx, wl, ac, "Slice has been deleted"); err != nil {
+            return ctrl.Result{}, err
+        }
+        return ctrl.Result{}, r.deleteSlicesForEvictedWorkload(ctx, grouped)
     }
 
     if len(grouped.deleted) > 0 {
@@ -217,6 +219,10 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
         return ctrl.Result{}, client.IgnoreNotFound(err)
     }
 
+    if ac.State == kueue.CheckStateRetry {
+        return ctrl.Result{}, r.deleteSlicesForEvictedWorkload(ctx, grouped)
+    }
+
     // Delete any Slices that are in a failed or stale state.
     if len(grouped.toDelete) > 0 {
         log.V(2).Info(
@@ -393,6 +399,20 @@ func (r *WorkloadReconciler) deleteSlices(ctx context.Context, slices []*v1beta1
     return nil
 }
 
+func (r *WorkloadReconciler) deleteSlicesForEvictedWorkload(ctx context.Context, grouped groupedSlices) error {
+    numSlicesToDelete := len(grouped.active) + len(grouped.initializing) + len(grouped.toDelete)
+    if numSlicesToDelete == 0 {
+        return nil
+    }
+    log := ctrl.LoggerFrom(ctx)
+    toDelete := make([]*v1beta1.Slice, 0, numSlicesToDelete)
+    toDelete = append(toDelete, grouped.active...)
+    toDelete = append(toDelete, grouped.initializing...)
+    toDelete = append(toDelete, grouped.toDelete...)
+    log.V(3).Info("AdmissionCheck is Retry, deleting all Slices")
+    return r.deleteSlices(ctx, toDelete)
+}
+
 func (r *WorkloadReconciler) ownerPodsFinished(ctx context.Context, wl *kueue.Workload) (bool, error) {
     if isJobSetOwner(wl) {
         return r.jobSetPodsFinished(ctx, wl)
@@ -551,6 +571,11 @@ func (r *WorkloadReconciler) syncSlices(
     slices []v1beta1.Slice,
     nodes map[string]corev1.Node,
 ) ([]v1beta1.Slice, error) {
+    // Prevent creating Slices while the AdmissionCheck is Retry
+    // and the workload still carries its old Admission.
+    if ac.State != kueue.CheckStatePending {
+        return nil, nil
+    }
     existingSlicesByName := make(map[string]*v1beta1.Slice, len(slices))
     for _, slice := range slices {
         existingSlicesByName[slice.Name] = &slice
@@ -729,7 +754,7 @@ func (r *WorkloadReconciler) validatePartitionCount(
 
 func (r *WorkloadReconciler) evictWorkload(ctx context.Context, wl *kueue.Workload, ac *kueue.AdmissionCheckState, message string) error {
     ac.State = kueue.CheckStateRetry
-    ac.RequeueAfterSeconds = ptr.To(int32(10))
+    ac.RequeueAfterSeconds = ptr.To(int32(r.retryDelayOnSliceFailure.Round(time.Second).Seconds()))
     ac.Message = api.TruncateConditionMessage(message)
     return r.updateWorkloadAdmissionCheckStatus(ctx, wl, ac)
 }
@@ -881,6 +906,10 @@ func buildAdmissionCheckMessage(slicesByState map[core.SliceState][]v1beta1.Slic
 }
 
 func (r *WorkloadReconciler) prepareAdmissionCheckStatus(wl *kueue.Workload, ac *kueue.AdmissionCheckState, slices []v1beta1.Slice) {
+    // Wait for Kueue to reset the check to Pending after eviction.
+    if ac.State == kueue.CheckStateRetry {
+        return
+    }
     slicesByState := groupSlicesByState(slices, r.activationTimeout)
 
     podSetRequiresHealthy := make(map[string]bool)
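
Taken together, these hunks gate the Slice lifecycle on the AdmissionCheck state: a Ready check that has lost a Slice evicts the workload and cleans up, a Retry check only cleans up (no status rewrites, no new Slices), and only a Pending check may create Slices. Below is a minimal, self-contained Go sketch of that ordering; the names CheckState, SliceGroups, cleanupForEviction, and reconcileSlices are hypothetical stand-ins for illustration, not this repository's API.

package main

import "fmt"

type CheckState string

const (
    StatePending CheckState = "Pending"
    StateReady   CheckState = "Ready"
    StateRetry   CheckState = "Retry"
)

// SliceGroups mirrors the shape of the reconciler's groupedSlices:
// slices bucketed by lifecycle phase.
type SliceGroups struct {
    Active, Initializing, ToDelete, Deleted []string
}

// cleanupForEviction collects every slice that must go once a workload
// is evicted, mirroring deleteSlicesForEvictedWorkload above.
func cleanupForEviction(g SliceGroups) []string {
    n := len(g.Active) + len(g.Initializing) + len(g.ToDelete)
    if n == 0 {
        return nil
    }
    out := make([]string, 0, n)
    out = append(out, g.Active...)
    out = append(out, g.Initializing...)
    out = append(out, g.ToDelete...)
    return out
}

// reconcileSlices shows the ordering the diff enforces. It returns the
// resulting check state, the slices to delete, and whether new slices
// may be created on this pass.
func reconcileSlices(state CheckState, g SliceGroups, desired int) (CheckState, []string, bool) {
    live := len(g.Active) + len(g.Initializing)
    // A Ready check with a lost slice flips to Retry and triggers cleanup.
    if state == StateReady && (len(g.Deleted) > 0 || live != desired) {
        return StateRetry, cleanupForEviction(g), false
    }
    // A Retry check only cleans up until Kueue resets it to Pending.
    if state == StateRetry {
        return StateRetry, cleanupForEviction(g), false
    }
    // Only Pending may create slices.
    return state, nil, state == StatePending
}

func main() {
    g := SliceGroups{Active: []string{"slice-a"}, Deleted: []string{"slice-b"}}
    state, toDelete, mayCreate := reconcileSlices(StateReady, g, 2)
    fmt.Println(state, toDelete, mayCreate) // Retry [slice-a] false
}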
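The evictWorkload change also swaps the hard-coded 10 for a configurable delay. Assuming retryDelayOnSliceFailure is a time.Duration field (an assumption; its type is not shown in this diff), the conversion works as sketched below: Duration.Seconds() returns a float64, so rounding to the nearest second first keeps a sub-second setting from truncating to zero.

package main

import (
    "fmt"
    "time"
)

// requeueSeconds mirrors the RequeueAfterSeconds conversion in the hunk
// above: round to whole seconds, then narrow to int32.
func requeueSeconds(d time.Duration) int32 {
    return int32(d.Round(time.Second).Seconds())
}

func main() {
    fmt.Println(requeueSeconds(500 * time.Millisecond)) // 1, not 0
    fmt.Println(requeueSeconds(10 * time.Second))       // 10
}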