Skip to content

Commit 3bbf2b6

Browse files
committed
WIP
1 parent 09c9b0a commit 3bbf2b6

3 files changed

Lines changed: 2 additions & 149 deletions

File tree

slice/internal/controller/workload_controller.go

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ import (
5050

5151
"tpu-slice-controller/api/v1beta1"
5252
"tpu-slice-controller/internal/core"
53+
"tpu-slice-controller/internal/features"
5354
"tpu-slice-controller/internal/topology"
5455
"tpu-slice-controller/internal/util/api"
5556
"tpu-slice-controller/internal/util/node"
@@ -851,13 +852,8 @@ func buildPodSetUpdates(wl *kueue.Workload) []kueue.PodSetUpdate {
851852
var podSetUpdates []kueue.PodSetUpdate
852853
for _, ps := range wl.Spec.PodSets {
853854
if topology := core.GetTPUTopology(ps.Template); topology != "" {
854-
labels := make(map[string]string)
855-
if features.Enabled(features.NodesInSlicesAntiAffinity) {
856-
labels[core.PodWebhookLabelKey] = "true"
857-
}
858855
podSetUpdates = append(podSetUpdates, kueue.PodSetUpdate{
859-
Name: ps.Name,
860-
Labels: labels,
856+
Name: ps.Name,
861857
NodeSelector: map[string]string{
862858
core.TPUTopologyAnnotation: topology,
863859
},

slice/internal/features/features.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,16 +25,11 @@ import (
2525
)
2626

2727
const (
28-
// Adds AntiAffinity that excludes nodes belonging to existing slices from scheduling.
29-
NodesInSlicesAntiAffinity featuregate.Feature = "NodesInSlicesAntiAffinity"
3028
// FailOnUntoleratedDegradedSlice treats degraded slices as failed if the workload requested only healthy slices.
3129
FailOnUntoleratedDegradedSlice featuregate.Feature = "FailOnUntoleratedDegradedSlice"
3230
)
3331

3432
var defaultVersionedFeatureGates = map[featuregate.Feature]featuregate.VersionedSpecs{
35-
NodesInSlicesAntiAffinity: {
36-
{Version: version.MustParse("0.1"), Default: true, PreRelease: featuregate.Alpha},
37-
},
3833
FailOnUntoleratedDegradedSlice: {
3934
{Version: version.MustParse("0.1"), Default: true, PreRelease: featuregate.Alpha},
4035
},

slice/test/e2e/jobset_test.go

Lines changed: 0 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -1478,144 +1478,6 @@ var _ = ginkgo.Describe("JobSet", func() {
14781478
})
14791479
})
14801480

1481-
ginkgo.It("should only admit the workload if nodes are not used in some other slices", func() {
1482-
nodes := &corev1.NodeList{}
1483-
gomega.Expect(k8sClient.List(ctx, nodes, client.HasLabels{core.TPUSubBlockLabel})).To(gomega.Succeed())
1484-
var partitionIDs []string
1485-
for _, node := range nodes.Items {
1486-
partitionIDs = append(partitionIDs, node.Labels[core.TPUSubBlockLabel])
1487-
}
1488-
gomega.Expect(partitionIDs).ToNot(gomega.BeEmpty())
1489-
manualSlice := &slice.Slice{
1490-
ObjectMeta: metav1.ObjectMeta{
1491-
Name: "manual-slice-blocking",
1492-
},
1493-
Spec: slice.SliceSpec{
1494-
Type: slice.TypeTpu7x,
1495-
Topology: "4x4x4",
1496-
PartitionIds: partitionIDs,
1497-
},
1498-
}
1499-
utils.MustCreate(ctx, k8sClient, manualSlice)
1500-
utils.SetSliceReady(ctx, k8sClient, client.ObjectKeyFromObject(manualSlice), manualSlice.Spec.Topology)
1501-
1502-
jobSet := testingjobsjobset.MakeJobSet("jobset", ns.Name).
1503-
Queue(lq.Name).
1504-
ReplicatedJobs(
1505-
testingjobsjobset.ReplicatedJobRequirements{
1506-
Name: "rj1",
1507-
Image: utils.E2eTestAgnHostImage,
1508-
Args: utils.BehaviorWaitForDeletion,
1509-
Replicas: 1,
1510-
Parallelism: 16,
1511-
Completions: 16,
1512-
PodAnnotations: map[string]string{
1513-
core.TPUSliceTopologyAnnotation: "4x4x4",
1514-
},
1515-
NodeSelector: map[string]string{
1516-
core.TPUAcceleratorLabel: string(slice.TypeTpu7x),
1517-
core.TPUSliceHealthNodeSelectorKey: core.TPUSliceHealthNodeSelectorHealthy,
1518-
},
1519-
},
1520-
).
1521-
RequestAndLimit("rj1", core.TPUResourceName, "4").
1522-
Obj()
1523-
1524-
ginkgo.By("Creating a JobSet", func() {
1525-
utils.MustCreate(ctx, k8sClient, jobSet)
1526-
})
1527-
1528-
createdWorkload := &kueue.Workload{}
1529-
wlKey := types.NamespacedName{
1530-
Name: jobsetcontroller.GetWorkloadNameForJobSet(jobSet.Name, jobSet.UID),
1531-
Namespace: ns.Name,
1532-
}
1533-
1534-
ginkgo.By("Check that the Workload is not admissible", func() {
1535-
gomega.Consistently(func(g gomega.Gomega) {
1536-
err := k8sClient.Get(ctx, wlKey, createdWorkload)
1537-
if err != nil {
1538-
g.Expect(client.IgnoreNotFound(err)).To(gomega.Succeed())
1539-
} else {
1540-
g.Expect(createdWorkload.Status.Admission).Should(gomega.BeNil())
1541-
}
1542-
}, utils.ConsistentDuration, utils.Interval).Should(gomega.Succeed())
1543-
})
1544-
1545-
ginkgo.By("Deleting the manual slice and unlabeling nodes", func() {
1546-
utils.ExpectObjectToBeDeleted(ctx, k8sClient, manualSlice, true)
1547-
for _, node := range nodes.Items {
1548-
gomega.Eventually(func(g gomega.Gomega) {
1549-
n := &corev1.Node{}
1550-
g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(&node), n)).To(gomega.Succeed())
1551-
delete(n.Labels, core.TPUSliceNodeLabel)
1552-
delete(n.Labels, core.TPUTopologyAnnotation)
1553-
g.Expect(k8sClient.Update(ctx, n)).To(gomega.Succeed())
1554-
}, utils.Timeout, utils.Interval).Should(gomega.Succeed())
1555-
}
1556-
})
1557-
1558-
ginkgo.By("Waiting for Admission of the Workload", func() {
1559-
gomega.Eventually(func(g gomega.Gomega) {
1560-
g.Expect(k8sClient.Get(ctx, wlKey, createdWorkload)).Should(gomega.Succeed())
1561-
g.Expect(createdWorkload.Status.Admission).ShouldNot(gomega.BeNil())
1562-
}, utils.Timeout, utils.Interval).Should(gomega.Succeed())
1563-
})
1564-
1565-
createdSlice := &slice.Slice{}
1566-
sliceKey := core.SliceKeyFromWorkload(createdWorkload, "rj1", 0)
1567-
1568-
ginkgo.By("Checking that Slice is created", func() {
1569-
gomega.Eventually(func(g gomega.Gomega) {
1570-
g.Expect(k8sClient.Get(ctx, sliceKey, createdSlice)).To(gomega.Succeed())
1571-
g.Expect(createdSlice.Spec.PartitionIds).To(gomega.HaveLen(1))
1572-
g.Expect(createdSlice.Spec.Topology).To(gomega.Equal("4x4x4"))
1573-
g.Expect(createdSlice.Spec.Type).To(gomega.Equal(slice.TypeTpu7x))
1574-
}, utils.Timeout, utils.Interval).Should(gomega.Succeed())
1575-
})
1576-
1577-
ginkgo.By("Adding Ready condition", func() {
1578-
utils.SetSliceReady(ctx, k8sClient, sliceKey, "4x4x4")
1579-
})
1580-
1581-
ginkgo.By("Checking that the Workload is admitted and admission check status is ready", func() {
1582-
gomega.Eventually(func(g gomega.Gomega) {
1583-
g.Expect(k8sClient.Get(ctx, wlKey, createdWorkload)).Should(gomega.Succeed())
1584-
g.Expect(workload.IsAdmitted(createdWorkload)).Should(gomega.BeTrue())
1585-
g.Expect(createdWorkload.Status.AdmissionChecks).Should(gomega.BeComparableTo([]kueue.AdmissionCheckState{{
1586-
Name: kueue.AdmissionCheckReference(ac.Name),
1587-
State: kueue.CheckStateReady,
1588-
Message: `Slices are in states: 1 ACTIVE`,
1589-
}}, cmpopts.IgnoreFields(kueue.AdmissionCheckState{}, "LastTransitionTime", "PodSetUpdates")))
1590-
}, utils.LongTimeout, utils.Timeout).Should(gomega.Succeed())
1591-
})
1592-
1593-
ginkgo.By("Checking that all pods are running with topology node selector and without anti-affinity", func() {
1594-
pods := &corev1.PodList{}
1595-
gomega.Eventually(func(g gomega.Gomega) {
1596-
g.Expect(k8sClient.List(ctx, pods, client.InNamespace(ns.Name))).To(gomega.Succeed())
1597-
g.Expect(pods.Items).Should(gomega.HaveLen(int(16)))
1598-
for _, pod := range pods.Items {
1599-
g.Expect(pod.Spec.NodeSelector).To(gomega.HaveKeyWithValue(core.TPUTopologyAnnotation, "4x4x4"))
1600-
g.Expect(pod.Spec.Affinity).To(gomega.BeNil())
1601-
g.Expect(pod.Status.Phase).To(gomega.Equal(corev1.PodRunning))
1602-
}
1603-
}, utils.LongTimeout, utils.Interval).Should(gomega.Succeed())
1604-
})
1605-
1606-
ginkgo.By("Deleting JobSet", func() {
1607-
utils.ExpectObjectToBeDeleted(ctx, k8sClient, jobSet, true)
1608-
})
1609-
1610-
ginkgo.By("Checking that Slice is deleted", func() {
1611-
utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdSlice, false)
1612-
})
1613-
1614-
ginkgo.By("Checking that Workload is deleted", func() {
1615-
utils.ExpectObjectToBeDeleted(ctx, k8sClient, createdWorkload, false)
1616-
})
1617-
})
1618-
16191481
ginkgo.It("should handle mixed tolerance for degraded slices across multiple PodSets", func() {
16201482
jobSet := testingjobsjobset.MakeJobSet("jobset", ns.Name).
16211483
Queue(lq.Name).

0 commit comments

Comments
 (0)