Skip to content

Commit a72a752

Browse files
authored
fix: xpk workload list should sum podSet counts (#1106)
Fix wildcard counting logic and priority for super-slicing
1 parent 57c21c6 commit a72a752

2 files changed

Lines changed: 78 additions & 39 deletions

File tree

src/xpk/core/workload.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -115,20 +115,23 @@ def _parse_workload_item(item: dict[str, Any]) -> _WorkloadListRow:
115115
or None
116116
)
117117

118-
tpu_vms_needed = _safe_int(pod_sets[0].get('count')) if pod_sets else None
119-
120-
pod_set_assignments = (
121-
item.get('status', {}).get('admission', {}).get('podSetAssignments') or []
118+
tpu_vms_needed = (
119+
sum(_safe_int(ps.get('count')) for ps in pod_sets) if pod_sets else None
122120
)
121+
122+
admission_status = item.get('status', {}).get('admission', {})
123+
pod_set_assignments = admission_status.get('podSetAssignments') or []
123124
tpu_vms_running_ran = (
124-
_safe_int(pod_set_assignments[0].get('count'))
125+
sum(_safe_int(psa.get('count')) for psa in pod_set_assignments)
125126
if pod_set_assignments
126127
else None
127128
)
128129

129130
reclaimable_pods = item.get('status', {}).get('reclaimablePods') or []
130131
tpu_vms_done = (
131-
_safe_int(reclaimable_pods[0].get('count')) if reclaimable_pods else None
132+
sum(_safe_int(rp.get('count')) for rp in reclaimable_pods)
133+
if reclaimable_pods
134+
else None
132135
)
133136

134137
conditions = item.get('status', {}).get('conditions') or [{}]

src/xpk/core/workload_test.py

Lines changed: 69 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -46,9 +46,9 @@ class _MockWorkloadData:
4646
jobset_name: str
4747
created_time: str
4848
priority: str
49-
needed: int | str
50-
running: int | str
51-
done: int | str
49+
needed: list[int]
50+
running: list[int]
51+
done: list[int]
5252
status: str
5353
message: str
5454
status_time: str
@@ -61,14 +61,19 @@ def _create_mock_workload_json(data: _MockWorkloadData):
6161
'ownerReferences': [{'name': data.jobset_name}],
6262
},
6363
'spec': {
64-
'podSets': [{
65-
'count': data.needed,
66-
'template': {'spec': {'priorityClassName': data.priority}},
67-
}]
64+
'podSets': [
65+
{
66+
'count': v,
67+
'template': {'spec': {'priorityClassName': data.priority}},
68+
}
69+
for v in data.needed
70+
]
6871
},
6972
'status': {
70-
'admission': {'podSetAssignments': [{'count': data.running}]},
71-
'reclaimablePods': [{'count': data.done}],
73+
'admission': {
74+
'podSetAssignments': [{'count': v} for v in data.running]
75+
},
76+
'reclaimablePods': [{'count': v} for v in data.done],
7277
'conditions': [{
7378
'type': data.status,
7479
'message': data.message,
@@ -102,9 +107,9 @@ def test_get_workload_list(commands_tester: CommandsTester):
102107
jobset_name='job-test',
103108
created_time='2024-01-01T00:00:00Z',
104109
priority='high',
105-
needed=32,
106-
running=32,
107-
done=0,
110+
needed=[32],
111+
running=[32],
112+
done=[0],
108113
status='Running',
109114
message='All good',
110115
status_time='2024-01-01T00:01:00Z',
@@ -143,9 +148,9 @@ def test_get_workload_list_filter_by_job(commands_tester: CommandsTester):
143148
jobset_name='job-test-1',
144149
created_time='2024-01-01T00:00:00Z',
145150
priority='high',
146-
needed=32,
147-
running=32,
148-
done=0,
151+
needed=[32],
152+
running=[32],
153+
done=[0],
149154
status='Running',
150155
message='All good',
151156
status_time='2024-01-01T00:01:00Z',
@@ -156,9 +161,9 @@ def test_get_workload_list_filter_by_job(commands_tester: CommandsTester):
156161
jobset_name='job-test-2',
157162
created_time='2024-01-02T00:00:00Z',
158163
priority='low',
159-
needed=4,
160-
running=4,
161-
done=0,
164+
needed=[4],
165+
running=[4],
166+
done=[0],
162167
status='Running',
163168
message='All good',
164169
status_time='2024-01-02T00:01:00Z',
@@ -169,9 +174,9 @@ def test_get_workload_list_filter_by_job(commands_tester: CommandsTester):
169174
jobset_name='other-job',
170175
created_time='2024-01-03T00:00:00Z',
171176
priority='high',
172-
needed=16,
173-
running='',
174-
done=0,
177+
needed=[16],
178+
running=[],
179+
done=[0],
175180
status='Admitted',
176181
message='Waiting',
177182
status_time='2024-01-03T00:01:00Z',
@@ -195,6 +200,37 @@ def test_get_workload_list_filter_by_job(commands_tester: CommandsTester):
195200
assert parsed_table[1]['Jobset Name'] == 'job-test-2'
196201

197202

203+
def test_get_workload_list_multiple_pod_sets(commands_tester: CommandsTester):
204+
mock_data = _MockWorkloadData(
205+
jobset_name='multi-podset-job',
206+
created_time='2024-01-01T00:00:00Z',
207+
priority='high',
208+
needed=[16, 32],
209+
running=[16, 32],
210+
done=[16, 32],
211+
status='Running',
212+
message='All good',
213+
status_time='2024-01-01T00:01:00Z',
214+
)
215+
mock_output = json.dumps({'items': [_create_mock_workload_json(mock_data)]})
216+
commands_tester.set_result_for_command(
217+
(0, mock_output), 'kubectl', 'get', 'workloads'
218+
)
219+
args = MagicMock()
220+
args.filter_by_status = 'EVERYTHING'
221+
args.filter_by_job = None
222+
223+
return_code, return_value = get_workload_list(args)
224+
225+
assert return_code == 0
226+
parsed_table = _parse_workload_table(return_value)
227+
assert len(parsed_table) == 1
228+
assert parsed_table[0]['Jobset Name'] == 'multi-podset-job'
229+
assert parsed_table[0]['TPU VMs Needed'] == '48'
230+
assert parsed_table[0]['TPU VMs Running/Ran'] == '48'
231+
assert parsed_table[0]['TPU VMs Done'] == '48'
232+
233+
198234
@pytest.mark.parametrize(
199235
'filter_by_status, expected_job_names',
200236
[
@@ -226,9 +262,9 @@ def test_get_workload_list_filters(
226262
jobset_name='queued-job',
227263
created_time='2024-01-01T00:00:00Z',
228264
priority='high',
229-
needed=4,
230-
running='',
231-
done=0,
265+
needed=[4],
266+
running=[],
267+
done=[0],
232268
status='Admitted',
233269
message='Waiting',
234270
status_time='2024-01-01T00:01:00Z',
@@ -239,9 +275,9 @@ def test_get_workload_list_filters(
239275
jobset_name='running-job',
240276
created_time='2024-01-01T00:00:00Z',
241277
priority='high',
242-
needed=4,
243-
running=4,
244-
done=0,
278+
needed=[4],
279+
running=[4],
280+
done=[0],
245281
status='Admitted',
246282
message='Running',
247283
status_time='2024-01-01T00:01:00Z',
@@ -252,9 +288,9 @@ def test_get_workload_list_filters(
252288
jobset_name='success-job',
253289
created_time='2024-01-01T00:00:00Z',
254290
priority='high',
255-
needed=4,
256-
running=4,
257-
done=4,
291+
needed=[4],
292+
running=[4],
293+
done=[4],
258294
status='Finished',
259295
message='Job finishedsuccessfully',
260296
status_time='2024-01-01T00:01:00Z',
@@ -265,9 +301,9 @@ def test_get_workload_list_filters(
265301
jobset_name='failed-job',
266302
created_time='2024-01-01T00:00:00Z',
267303
priority='high',
268-
needed=4,
269-
running=4,
270-
done=0,
304+
needed=[4],
305+
running=[4],
306+
done=[0],
271307
status='Finished',
272308
message='Job failed witherror',
273309
status_time='2024-01-01T00:01:00Z',

0 commit comments

Comments
 (0)