Skip to content

Commit 8e8ba14

Browse files
authored
[slice] Handle long-named slices after slice-controller upgrade (#1160)
* handle long and short named slices for upgrade scenario * add test case for long name wl reconcile when fg is enabled
1 parent 6084fb3 commit 8e8ba14

4 files changed

Lines changed: 166 additions & 7 deletions

File tree

slice/internal/controller/workload_controller.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -611,15 +611,14 @@ func (r *WorkloadReconciler) syncSlicesForAssignment(ctx context.Context, wl *ku
611611
var deletedSlices []string
612612

613613
for i := range desiredNumberOfSlices {
614-
sliceName := core.SliceName(wl.Namespace, wl.Name, psa.Name, i)
615614
start := i * chunkSize
616615
end := start + chunkSize
617616
var expectedPartitionIDs []string
618617
if len(parsedAssignment.PartitionIDs) > 0 {
619618
expectedPartitionIDs = parsedAssignment.PartitionIDs[start:end]
620619
}
621620

622-
if existingSlice, exist := existingSlicesByName[sliceName]; exist {
621+
if existingSlice, exist := core.FindExistingSlice(existingSlicesByName, wl.Namespace, wl.Name, psa.Name, i); exist {
623622
if !slices.Equal(existingSlice.Spec.PartitionIds, expectedPartitionIDs) {
624623
if existingSlice.DeletionTimestamp.IsZero() {
625624
log := ctrl.LoggerFrom(ctx).WithValues("slice", klog.KObj(existingSlice))

slice/internal/controller/workload_controller_test.go

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2283,6 +2283,85 @@ func TestWorkloadReconciler(t *testing.T) {
22832283
},
22842284
wantResult: reconcile.Result{RequeueAfter: initializationRetryAfter},
22852285
},
2286+
"should adopt existing long-named slice when ShorterSliceNameLength feature gate enabled": {
2287+
enableShorterSliceNameLength: true,
2288+
request: types.NamespacedName{Name: "very-long-workload-name-exceeding-limit-for-testing", Namespace: corev1.NamespaceDefault},
2289+
objs: []client.Object{
2290+
worker1Node.DeepCopy(),
2291+
worker2Node.DeepCopy(),
2292+
baseAdmissionCheckWrapper.DeepCopy(),
2293+
utiltesting.MakeWorkload("very-long-workload-name-exceeding-limit-for-testing", corev1.NamespaceDefault).
2294+
UID("very-long-workload-name-exceeding-limit-for-testing").
2295+
PodSets(basePodSets...).
2296+
ReserveQuota(baseAdmission, now).
2297+
ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName).
2298+
Finalizers(SliceControllerName).
2299+
AdmissionCheck(buildAdmissionCheckState(kueue.CheckStatePending, "")).
2300+
Obj(),
2301+
baseJobSetWrapper.Clone().Obj(),
2302+
// Existing Slices with LONG names
2303+
utiltesting.MakeSliceWrapper(core.SliceNameWithMaxLen("default", "very-long-workload-name-exceeding-limit-for-testing", "ps1", 0, 54)).
2304+
Type(slice.TypeTpu7x).
2305+
Topology("4x4x4").
2306+
OwnerWorkloadAnnotations("default", "very-long-workload-name-exceeding-limit-for-testing").
2307+
PartitionIDs("subblock1").
2308+
Active().
2309+
Obj(),
2310+
utiltesting.MakeSliceWrapper(core.SliceNameWithMaxLen("default", "very-long-workload-name-exceeding-limit-for-testing", "ps2", 0, 54)).
2311+
Type(slice.TypeTpu7x).
2312+
Topology("4x4x4").
2313+
OwnerWorkloadAnnotations("default", "very-long-workload-name-exceeding-limit-for-testing").
2314+
PartitionIDs("subblock2").
2315+
Active().
2316+
Obj(),
2317+
},
2318+
wantWorkloads: []kueue.Workload{
2319+
*utiltesting.MakeWorkload("very-long-workload-name-exceeding-limit-for-testing", corev1.NamespaceDefault).
2320+
UID("very-long-workload-name-exceeding-limit-for-testing").
2321+
PodSets(basePodSets...).
2322+
ReserveQuota(baseAdmission, now).
2323+
ControllerReference(jobSetGVK, baseJobSetName, baseJobSetName).
2324+
Finalizers(SliceControllerName).
2325+
AdmissionCheck(buildAdmissionCheckStateWithPodSetUpdates(kueue.CheckStateReady,
2326+
`Slices are in states: 2 ACTIVE`,
2327+
[]kueue.PodSetUpdate{
2328+
{
2329+
Name: "ps1",
2330+
NodeSelector: map[string]string{"cloud.google.com/gke-tpu-topology": "4x4x4"},
2331+
},
2332+
{
2333+
Name: "ps2",
2334+
NodeSelector: map[string]string{"cloud.google.com/gke-tpu-topology": "4x4x4"},
2335+
},
2336+
})).
2337+
Obj(),
2338+
},
2339+
wantSlices: []slice.Slice{
2340+
*utiltesting.MakeSliceWrapper(core.SliceNameWithMaxLen("default", "very-long-workload-name-exceeding-limit-for-testing", "ps2", 0, 54)).
2341+
Type(slice.TypeTpu7x).
2342+
Topology("4x4x4").
2343+
OwnerWorkloadAnnotations("default", "very-long-workload-name-exceeding-limit-for-testing").
2344+
PartitionIDs("subblock2").
2345+
Active().
2346+
Obj(),
2347+
*utiltesting.MakeSliceWrapper(core.SliceNameWithMaxLen("default", "very-long-workload-name-exceeding-limit-for-testing", "ps1", 0, 54)).
2348+
Type(slice.TypeTpu7x).
2349+
Topology("4x4x4").
2350+
OwnerWorkloadAnnotations("default", "very-long-workload-name-exceeding-limit-for-testing").
2351+
PartitionIDs("subblock1").
2352+
Active().
2353+
Obj(),
2354+
},
2355+
wantJobSets: []jobset.JobSet{*baseJobSetWrapper.Clone().Obj()},
2356+
wantEvents: []utiltesting.EventRecord{
2357+
{
2358+
Key: client.ObjectKey{Namespace: corev1.NamespaceDefault, Name: "very-long-workload-name-exceeding-limit-for-testing"},
2359+
EventType: corev1.EventTypeNormal,
2360+
Reason: AdmissionCheckUpdatedEventType,
2361+
Message: fmt.Sprintf(`Admission check %q updated state from "Pending" to "Ready"`, baseACName),
2362+
},
2363+
},
2364+
},
22862365
}
22872366
for name, tc := range testCases {
22882367
t.Run(name, func(t *testing.T) {

slice/internal/core/slice.go

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,18 +54,37 @@ func SliceWithMetadata(wl *kueue.Workload, podSetName kueue.PodSetReference, sli
5454
}
5555
}
5656

57-
func SliceName(ns string, workloadName string, podSetName kueue.PodSetReference, sliceIndex int32) string {
57+
func SliceNameWithMaxLen(ns string, workloadName string, podSetName kueue.PodSetReference, sliceIndex int32, maxLen int) string {
5858
name := fmt.Sprintf("%s-%s-%s-%d", ns, workloadName, podSetName, sliceIndex)
59+
if len(name) <= maxLen {
60+
return name
61+
}
62+
hash := sha256.Sum256([]byte(name))
63+
return fmt.Sprintf("%s-%s", name[:maxLen-(hashLength+1)], hex.EncodeToString(hash[:])[:hashLength])
64+
}
65+
66+
func SliceName(ns string, workloadName string, podSetName kueue.PodSetReference, sliceIndex int32) string {
5967
maxLen := maxSliceNameLength
6068
if features.Enabled(features.ShorterSliceNameLength) {
6169
maxLen = maxShorterSliceNameLength
6270
}
71+
return SliceNameWithMaxLen(ns, workloadName, podSetName, sliceIndex, maxLen)
72+
}
6373

64-
if len(name) <= maxLen {
65-
return name
74+
func FindExistingSlice(existingSlicesByName map[string]*v1beta1.Slice, ns string, wlName string, podSetName kueue.PodSetReference, index int32) (*v1beta1.Slice, bool) {
75+
currentName := SliceName(ns, wlName, podSetName, index)
76+
if slice, exist := existingSlicesByName[currentName]; exist {
77+
return slice, true
6678
}
67-
hash := sha256.Sum256([]byte(name))
68-
return fmt.Sprintf("%s-%s", name[:maxLen-(hashLength+1)], hex.EncodeToString(hash[:])[:hashLength])
79+
if features.Enabled(features.ShorterSliceNameLength) {
80+
// Check the long name format to support upgrades
81+
// if long-named Slices already exist in the cluster.
82+
longName := SliceNameWithMaxLen(ns, wlName, podSetName, index, maxSliceNameLength)
83+
if slice, exist := existingSlicesByName[longName]; exist {
84+
return slice, true
85+
}
86+
}
87+
return nil, false
6988
}
7089

7190
func isStale(slice *v1beta1.Slice, timeout time.Duration) bool {

slice/internal/core/slice_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ package core
1919
import (
2020
"testing"
2121

22+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2223
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta2"
2324

25+
"tpu-slice-controller/api/v1beta1"
2426
"tpu-slice-controller/internal/features"
2527
)
2628

@@ -96,3 +98,63 @@ func TestSliceName(t *testing.T) {
9698
})
9799
}
98100
}
101+
102+
func TestFindExistingSlice(t *testing.T) {
103+
ns := "ns"
104+
wlName := "very-long-workload-name-exceeding-the-limit-for-testing"
105+
podSet := kueue.PodSetReference("ps")
106+
var index int32 = 0
107+
108+
longName := SliceNameWithMaxLen(ns, wlName, podSet, index, maxSliceNameLength)
109+
shortName := SliceNameWithMaxLen(ns, wlName, podSet, index, maxShorterSliceNameLength)
110+
111+
testCases := map[string]struct {
112+
gateEnabled bool
113+
m map[string]*v1beta1.Slice
114+
wantFound bool
115+
wantSlice string
116+
}{
117+
"gate disabled, found long name": {
118+
gateEnabled: false,
119+
m: map[string]*v1beta1.Slice{
120+
longName: {ObjectMeta: metav1.ObjectMeta{Name: longName}},
121+
},
122+
wantFound: true,
123+
wantSlice: longName,
124+
},
125+
"gate enabled, found short name": {
126+
gateEnabled: true,
127+
m: map[string]*v1beta1.Slice{
128+
shortName: {ObjectMeta: metav1.ObjectMeta{Name: shortName}},
129+
},
130+
wantFound: true,
131+
wantSlice: shortName,
132+
},
133+
"gate enabled, found long name (fallback)": {
134+
gateEnabled: true,
135+
m: map[string]*v1beta1.Slice{
136+
longName: {ObjectMeta: metav1.ObjectMeta{Name: longName}},
137+
},
138+
wantFound: true,
139+
wantSlice: longName,
140+
},
141+
"gate enabled, not found": {
142+
gateEnabled: true,
143+
m: map[string]*v1beta1.Slice{},
144+
wantFound: false,
145+
},
146+
}
147+
148+
for name, tc := range testCases {
149+
t.Run(name, func(t *testing.T) {
150+
features.SetFeatureGateDuringTest(t, features.ShorterSliceNameLength, tc.gateEnabled)
151+
got, found := FindExistingSlice(tc.m, ns, wlName, podSet, index)
152+
if found != tc.wantFound {
153+
t.Errorf("FindExistingSlice() found = %v, want %v", found, tc.wantFound)
154+
}
155+
if tc.wantFound && got.Name != tc.wantSlice {
156+
t.Errorf("FindExistingSlice() got name = %q, want %q", got.Name, tc.wantSlice)
157+
}
158+
})
159+
}
160+
}

0 commit comments

Comments
 (0)