@@ -165,18 +165,26 @@ func TestWorkloadReconciler(t *testing.T) {
165165 Type (slice .TypeTpu7x ).
166166 Topology ("4x4x4" ).
167167 OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).
168+ Annotation (core .OwnerPodSetNameAnnotation , "ps1" ).
168169 PartitionIDs ("subblock1" )
169170 baseSlice2Wrapper := baseSlice1Wrapper .Clone ().Name (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "ps2" , 0 )).
170171 Type (slice .TypeTpu7x ).
171172 Topology ("4x4x4" ).
172173 OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).
174+ Annotation (core .OwnerPodSetNameAnnotation , "ps2" ).
173175 PartitionIDs ("subblock2" )
174176
175177 worker1Node := utiltesting .MakeNode ("worker1" ).Label (core .TPUSubBlockLabel , "subblock1" )
176178 worker2Node := utiltesting .MakeNode ("worker2" ).Label (core .TPUSubBlockLabel , "subblock2" )
177179 worker3Node := utiltesting .MakeNode ("worker3" ).Label (core .TPUSubBlockLabel , "subblock3" )
178180 worker4Node := utiltesting .MakeNode ("worker4" ).Label (core .TPUSubBlockLabel , "subblock4" )
179181
182+ podSetRequiringHealthy := basePodSet1Wrapper .DeepCopy ()
183+ if podSetRequiringHealthy .Template .Spec .NodeSelector == nil {
184+ podSetRequiringHealthy .Template .Spec .NodeSelector = make (map [string ]string )
185+ }
186+ podSetRequiringHealthy .Template .Spec .NodeSelector [core .TPUSliceHealthNodeSelectorKey ] = core .TPUSliceHealthNodeSelectorHealthy
187+
180188 testCases := map [string ]struct {
181189 interceptorFuncsCreate func (ctx context.Context , client client.WithWatch , obj client.Object , opts ... client.CreateOption ) error
182190 request types.NamespacedName
@@ -1107,9 +1115,9 @@ func TestWorkloadReconciler(t *testing.T) {
11071115 Obj (),
11081116 },
11091117 wantSlices : []slice.Slice {
1110- * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj1" , 0 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).PartitionIDs ("subblock2" ).Obj (),
1111- * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj1" , 1 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).PartitionIDs ("subblock3" ).Obj (),
1112- * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj2" , 0 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).PartitionIDs ("subblock1" ).Obj (),
1118+ * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj1" , 0 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).Annotation ( core . OwnerPodSetNameAnnotation , "rj1" ). PartitionIDs ("subblock2" ).Obj (),
1119+ * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj1" , 1 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).Annotation ( core . OwnerPodSetNameAnnotation , "rj1" ). PartitionIDs ("subblock3" ).Obj (),
1120+ * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj2" , 0 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).Annotation ( core . OwnerPodSetNameAnnotation , "rj2" ). PartitionIDs ("subblock1" ).Obj (),
11131121 },
11141122 wantEvents : []utiltesting.EventRecord {buildEventRecord (corev1 .NamespaceDefault , corev1 .EventTypeNormal , SlicesCreatedEventType , `The Slices "default-workload-rj1-0", "default-workload-rj1-1", "default-workload-rj2-0" have been created` )},
11151123 wantResult : reconcile.Result {RequeueAfter : initializationRetryAfter },
@@ -1187,10 +1195,10 @@ func TestWorkloadReconciler(t *testing.T) {
11871195 Obj (),
11881196 },
11891197 wantSlices : []slice.Slice {
1190- * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj1" , 0 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).PartitionIDs ("subblock1" ).Obj (),
1191- * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj1" , 1 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).PartitionIDs ("subblock2" ).Obj (),
1192- * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj2" , 0 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).PartitionIDs ("subblock3" ).Obj (),
1193- * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj2" , 1 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).PartitionIDs ("subblock4" ).Obj (),
1198+ * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj1" , 0 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).Annotation ( core . OwnerPodSetNameAnnotation , "rj1" ). PartitionIDs ("subblock1" ).Obj (),
1199+ * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj1" , 1 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).Annotation ( core . OwnerPodSetNameAnnotation , "rj1" ). PartitionIDs ("subblock2" ).Obj (),
1200+ * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj2" , 0 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).Annotation ( core . OwnerPodSetNameAnnotation , "rj2" ). PartitionIDs ("subblock3" ).Obj (),
1201+ * utiltesting .MakeSliceWrapper (core .SliceName (corev1 .NamespaceDefault , baseWorkloadName , "rj2" , 1 )).Type (slice .TypeTpu7x ).Topology ("4x4x4" ).OwnerWorkloadAnnotations (corev1 .NamespaceDefault , baseWorkloadName ).Annotation ( core . OwnerPodSetNameAnnotation , "rj2" ). PartitionIDs ("subblock4" ).Obj (),
11941202 },
11951203 wantEvents : []utiltesting.EventRecord {buildEventRecord (corev1 .NamespaceDefault , corev1 .EventTypeNormal , SlicesCreatedEventType , `The Slices "default-workload-rj1-0", "default-workload-rj1-1", "default-workload-rj2-0", "default-workload-rj2-1" have been created` )},
11961204 wantResult : reconcile.Result {RequeueAfter : initializationRetryAfter },
@@ -1848,6 +1856,7 @@ func TestWorkloadReconciler(t *testing.T) {
18481856 Type (slice .TypeTpu7x ).
18491857 Topology ("4x4x4" ).
18501858 OwnerWorkloadAnnotations ("namespace1" , baseWorkloadName ).
1859+ Annotation (core .OwnerPodSetNameAnnotation , "ps1" ).
18511860 PartitionIDs ("subblock1" ).
18521861 Active ().
18531862 Obj (),
@@ -1883,13 +1892,15 @@ func TestWorkloadReconciler(t *testing.T) {
18831892 Type (slice .TypeTpu7x ).
18841893 Topology ("4x4x4" ).
18851894 OwnerWorkloadAnnotations ("namespace1" , baseWorkloadName ).
1895+ Annotation (core .OwnerPodSetNameAnnotation , "ps1" ).
18861896 PartitionIDs ("subblock1" ).
18871897 Active ().
18881898 Obj (),
18891899 * utiltesting .MakeSliceWrapper (core .SliceName ("namespace2" , baseWorkloadName , "ps2" , 0 )).
18901900 Type (slice .TypeTpu7x ).
18911901 Topology ("4x4x4" ).
18921902 OwnerWorkloadAnnotations ("namespace2" , baseWorkloadName ).
1903+ Annotation (core .OwnerPodSetNameAnnotation , "ps2" ).
18931904 PartitionIDs ("subblock2" ).
18941905 Obj (),
18951906 },
@@ -1948,6 +1959,126 @@ func TestWorkloadReconciler(t *testing.T) {
19481959 Obj (),
19491960 },
19501961 },
1962+ "should retry if a PodSet requiring healthy slices gets a degraded slice" : {
1963+ request : baseRequest ,
1964+ objs : []client.Object {
1965+ worker1Node .DeepCopy (),
1966+ worker2Node .DeepCopy (),
1967+ baseAdmissionCheckWrapper .DeepCopy (),
1968+ baseWorkloadWrapper .Clone ().
1969+ PodSets (
1970+ * podSetRequiringHealthy ,
1971+ * basePodSet2Wrapper .DeepCopy (),
1972+ ).
1973+ ReserveQuota (baseAdmission , now ).
1974+ ControllerReference (jobSetGVK , baseJobSetName , baseJobSetName ).
1975+ Finalizers (SliceControllerName ).
1976+ Obj (),
1977+ baseJobSetWrapper .Clone ().Obj (),
1978+ baseSlice1Wrapper .Clone ().
1979+ Annotation (core .OwnerPodSetNameAnnotation , "ps1" ).
1980+ Condition (metav1.Condition {
1981+ Type : slice .SliceStateConditionType ,
1982+ Status : metav1 .ConditionTrue ,
1983+ Reason : string (core .MMIGHealthStatusActiveDegraded ),
1984+ Message : "Hardware failure" ,
1985+ }).Obj (),
1986+ baseSlice2Wrapper .Clone ().
1987+ Active ().
1988+ Annotation (core .OwnerPodSetNameAnnotation , "ps2" ).
1989+ Obj (),
1990+ },
1991+ wantWorkloads : []kueue.Workload {
1992+ * baseWorkloadWrapper .Clone ().
1993+ PodSets (
1994+ * podSetRequiringHealthy ,
1995+ * basePodSet2Wrapper .DeepCopy (),
1996+ ).
1997+ ReserveQuota (baseAdmission , now ).
1998+ ControllerReference (jobSetGVK , baseJobSetName , baseJobSetName ).
1999+ Finalizers (SliceControllerName ).
2000+ AdmissionCheck (buildAdmissionCheckStateWithRequeue (kueue .CheckStateRetry ,
2001+ `Slices are in states: 1 ACTIVE, 1 ACTIVE_DEGRADED. Errors: Hardware failure (degraded)` , ptr .To (int32 (10 )))).
2002+ Obj (),
2003+ },
2004+ wantJobSets : []jobset.JobSet {* baseJobSetWrapper .Clone ().Obj ()},
2005+ wantEvents : []utiltesting.EventRecord {
2006+ buildEventRecord (corev1 .NamespaceDefault , corev1 .EventTypeNormal , AdmissionCheckUpdatedEventType ,
2007+ fmt .Sprintf (`Admission check %q updated state from "Pending" to "Retry"` , baseACName )),
2008+ },
2009+ },
2010+ "should be ready if a PodSet tolerating degraded slices gets a degraded slice" : {
2011+ request : baseRequest ,
2012+ objs : []client.Object {
2013+ worker1Node .DeepCopy (),
2014+ worker2Node .DeepCopy (),
2015+ baseAdmissionCheckWrapper .DeepCopy (),
2016+ baseWorkloadWrapper .Clone ().
2017+ PodSets (
2018+ * podSetRequiringHealthy ,
2019+ * basePodSet2Wrapper .DeepCopy (),
2020+ ).
2021+ ReserveQuota (baseAdmission , now ).
2022+ ControllerReference (jobSetGVK , baseJobSetName , baseJobSetName ).
2023+ Finalizers (SliceControllerName ).
2024+ Obj (),
2025+ baseJobSetWrapper .Clone ().Obj (),
2026+ baseSlice1Wrapper .Clone ().
2027+ Active ().
2028+ Annotation (core .OwnerPodSetNameAnnotation , "ps1" ).
2029+ Obj (),
2030+ baseSlice2Wrapper .Clone ().
2031+ Annotation (core .OwnerPodSetNameAnnotation , "ps2" ).
2032+ Condition (metav1.Condition {
2033+ Type : slice .SliceStateConditionType ,
2034+ Status : metav1 .ConditionTrue ,
2035+ Reason : string (core .MMIGHealthStatusActiveDegraded ),
2036+ Message : "Hardware failure" ,
2037+ }).Obj (),
2038+ },
2039+ wantWorkloads : []kueue.Workload {
2040+ * baseWorkloadWrapper .Clone ().
2041+ PodSets (
2042+ * podSetRequiringHealthy ,
2043+ * basePodSet2Wrapper .DeepCopy (),
2044+ ).
2045+ ReserveQuota (baseAdmission , now ).
2046+ ControllerReference (jobSetGVK , baseJobSetName , baseJobSetName ).
2047+ Finalizers (SliceControllerName ).
2048+ AdmissionCheck (buildAdmissionCheckStateWithPodSetUpdates (kueue .CheckStateReady ,
2049+ `Slices are in states: 1 ACTIVE, 1 ACTIVE_DEGRADED` ,
2050+ []kueue.PodSetUpdate {
2051+ {
2052+ Name : "ps1" ,
2053+ NodeSelector : map [string ]string {"cloud.google.com/gke-tpu-topology" : "4x4x4" },
2054+ },
2055+ {
2056+ Name : "ps2" ,
2057+ NodeSelector : map [string ]string {"cloud.google.com/gke-tpu-topology" : "4x4x4" },
2058+ },
2059+ })).
2060+ Obj (),
2061+ },
2062+ wantSlices : []slice.Slice {
2063+ * baseSlice1Wrapper .Clone ().
2064+ Active ().
2065+ Annotation (core .OwnerPodSetNameAnnotation , "ps1" ).
2066+ Obj (),
2067+ * baseSlice2Wrapper .Clone ().
2068+ Annotation (core .OwnerPodSetNameAnnotation , "ps2" ).
2069+ Condition (metav1.Condition {
2070+ Type : slice .SliceStateConditionType ,
2071+ Status : metav1 .ConditionTrue ,
2072+ Reason : string (core .MMIGHealthStatusActiveDegraded ),
2073+ Message : "Hardware failure" ,
2074+ }).Obj (),
2075+ },
2076+ wantJobSets : []jobset.JobSet {* baseJobSetWrapper .Clone ().Obj ()},
2077+ wantEvents : []utiltesting.EventRecord {
2078+ buildEventRecord (corev1 .NamespaceDefault , corev1 .EventTypeNormal , AdmissionCheckUpdatedEventType ,
2079+ fmt .Sprintf (`Admission check %q updated state from "Pending" to "Ready"` , baseACName )),
2080+ },
2081+ },
19512082 }
19522083 for name , tc := range testCases {
19532084 t .Run (name , func (t * testing.T ) {
0 commit comments