Skip to content

Commit 65ffde6

Browse files
authored
fix: relax Kueue version check for sub and super-slicing workloads (#1111)
fix: relax Kueue version check for custom image SHAs
1 parent 2fda9b8 commit 65ffde6

4 files changed

Lines changed: 103 additions & 19 deletions

File tree

src/xpk/core/kueue_manager.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
from ..utils.file import write_tmp_file
4141
from ..utils.console import xpk_print, xpk_exit, ask_for_user_consent
4242
from ..utils.templates import TEMPLATE_PATH, get_templates_absolute_path
43-
from packaging.version import Version
43+
from packaging.version import Version, InvalidVersion
4444

4545
KUEUE_VERSION = Version("v0.15.2")
4646
LATEST_BREAKING_VERSION = Version("v0.15.0")
@@ -113,7 +113,13 @@ def install_or_upgrade(
113113
return_code, installed_version = get_installed_kueue_version()
114114

115115
if return_code == 0 and installed_version:
116-
if installed_version > self.kueue_version:
116+
if isinstance(installed_version, str):
117+
xpk_print(
118+
"Cluster has an unknown or custom Kueue version installed. Skipping"
119+
" installation."
120+
)
121+
return 0
122+
elif installed_version > self.kueue_version:
117123
xpk_print(
118124
f"Cluster has a newer Kueue version, {installed_version}. Skipping"
119125
" installation."
@@ -532,7 +538,16 @@ def __autocorrect_resource_limits(
532538

533539
def get_installed_kueue_version(
534540
dry_run_version: Version | None = None,
535-
) -> tuple[int, Version | None]:
541+
) -> tuple[int, Version | str | None]:
542+
"""Gets the currently installed Kueue version from the cluster.
543+
544+
Returns a tuple containing:
545+
- The return code of the kubectl command.
546+
- The version information:
547+
- None if the command fails or the value is empty (e.g. Kueue not installed).
548+
- A Version object if the image tag can be parsed as a valid version.
549+
- A string if the image tag contains a custom SHA or is unparseable.
550+
"""
536551
command = (
537552
"kubectl get deployment kueue-controller-manager -n kueue-system -o"
538553
" jsonpath='{.spec.template.spec.containers[0].image}'"
@@ -547,12 +562,17 @@ def get_installed_kueue_version(
547562
else ""
548563
),
549564
)
550-
if return_code != 0:
565+
if return_code != 0 or not val:
551566
return return_code, None
552-
version_tag = val.split(":")
553-
if len(version_tag) == 1:
554-
return 1, None
555-
return return_code, Version(version_tag[-1])
567+
568+
if "@sha256:" in val or ":" not in val:
569+
return return_code, val
570+
571+
version_tag = val.split(":")[-1]
572+
try:
573+
return return_code, Version(version_tag)
574+
except InvalidVersion:
575+
return return_code, val
556576

557577

558578
def has_sub_slicing_enabled() -> tuple[int, bool | None]:

src/xpk/core/kueue_manager_test.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,16 @@ def set_installed_kueue_version(
6666
)
6767

6868

69+
def set_custom_installed_kueue_version(
    commands_tester: CommandsTester, image: str
):
    """Stub the Kueue controller image lookup with an arbitrary image string.

    Registers a successful result (return code 0) whose output is `image`,
    passed through verbatim, for the `kubectl get deployment
    kueue-controller-manager` / `containers[0].image` command fragments.

    Args:
      commands_tester: The command stubbing helper to configure.
      image: The full image reference to report, e.g. one pinned by digest.
    """
    # Return code 0 paired with the raw image reference as command output.
    stubbed_result = (0, image)
    commands_tester.set_result_for_command(
        stubbed_result,
        "kubectl get deployment kueue-controller-manager",
        "containers[0].image",
    )
77+
78+
6979
@pytest.fixture(autouse=True)
7080
def mock_ask_for_user_consent(mocker: MockerFixture) -> MagicMock:
7181
return mocker.patch(
@@ -104,6 +114,21 @@ def test_install_or_upgrade_when_newer_version_already_installed(
104114
mock_commands.assert_command_not_run("kubectl apply")
105115

106116

117+
def test_install_or_upgrade_when_custom_version_already_installed(
    mock_commands: CommandsTester, kueue_manager: KueueManager
):
    """install_or_upgrade returns 0 and applies nothing for a digest-pinned image."""
    # Image referenced by digest rather than by a version tag.
    digest_pinned_image = (
        "us-central1-docker.pkg.dev/dummy-project/kueue/kueue"
        "@sha256:1234567890abcdef1234567890abcdef1234567890abcdef1234567890abcdef"
    )
    set_custom_installed_kueue_version(mock_commands, digest_pinned_image)

    exit_code = kueue_manager.install_or_upgrade(KUEUE_CONFIG)

    assert exit_code == 0
    mock_commands.assert_command_not_run("kubectl apply")
130+
131+
107132
def test_install_or_upgrade_when_outdated(
108133
mock_commands: CommandsTester, kueue_manager: KueueManager
109134
):

src/xpk/core/scheduling.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,26 @@
3535
_SUB_SLICING_MINIMUM_KUEUE_VERSION = Version('0.13.0')
3636
_SUPER_SLICING_MINIMUM_KUEUE_VERSION = Version('0.15.2')
3737
_SUPER_SLICING_MAX_CUBES = 144
38+
39+
40+
def _is_kueue_version_sufficient(
    return_code: int,
    current_version: Version | str | None,
    minimum_version: Version,
) -> bool:
    """Decide whether the installed Kueue version meets a minimum requirement.

    The check is deliberately permissive: when the version cannot be compared
    (the lookup returned a non-zero code, no version was found, or the version
    is a plain string rather than a `Version`), a warning is printed and the
    requirement is treated as satisfied.

    Args:
      return_code: Return code of the Kueue version lookup.
      current_version: The detected version; a `Version` when comparable, a
        string or None otherwise.
      minimum_version: The lowest version that is considered sufficient.

    Returns:
      True if the version is unknown or is at least `minimum_version`,
      False if a comparable version is lower than `minimum_version`.
    """
    # Only a successfully retrieved, non-string version can be compared.
    if (
        return_code == 0
        and current_version is not None
        and not isinstance(current_version, str)
    ):
        return current_version >= minimum_version

    # Unknown version: warn, but do not block the workload submission.
    xpk_print(
        'Warning: Could not determine Kueue version. Proceeding with workload'
        ' submission, but scheduling might fail if Kueue is outdated.'
    )
    return True
56+
57+
3858
ONE_TO_ONE_REPLICA_NODE_POOL_ASSIGNMENT_ANNOTATION = (
3959
'alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool'
4060
)
@@ -181,11 +201,8 @@ def _check_sub_slicing_availability(
181201
return_code, current_version = get_installed_kueue_version(
182202
dry_run_version=_SUB_SLICING_MINIMUM_KUEUE_VERSION
183203
)
184-
185-
return (
186-
return_code == 0
187-
and current_version is not None
188-
and current_version >= _SUB_SLICING_MINIMUM_KUEUE_VERSION
204+
return _is_kueue_version_sufficient(
205+
return_code, current_version, _SUB_SLICING_MINIMUM_KUEUE_VERSION
189206
)
190207

191208

@@ -205,11 +222,8 @@ def _check_super_slicing_availability(
205222
return_code, current_version = get_installed_kueue_version(
206223
dry_run_version=_SUPER_SLICING_MINIMUM_KUEUE_VERSION
207224
)
208-
209-
return (
210-
return_code == 0
211-
and current_version is not None
212-
and current_version >= _SUPER_SLICING_MINIMUM_KUEUE_VERSION
225+
return _is_kueue_version_sufficient(
226+
return_code, current_version, _SUPER_SLICING_MINIMUM_KUEUE_VERSION
213227
)
214228

215229

src/xpk/core/scheduling_test.py

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ class SchedulingTestCase:
174174
cluster_system: SystemCharacteristics | None = None
175175
resources_config_map: dict[str, str] | None = None
176176
kueue_version: str | None = None
177+
kueue_version_return_code: int = 0
177178
sub_slicing_feature_enabled: bool = False
178179
sub_slicing_topology_set: bool = False
179180
super_slicing_topology_set: bool = False
@@ -269,6 +270,16 @@ class SchedulingTestCase:
269270
),
270271
WorkloadScheduling.UNAVAILABLE,
271272
),
273+
(
274+
'Sub-slicing, with unparseable Kueue version (SHA)',
275+
dataclasses.replace(SUB_SLICING_CASE, kueue_version='sha256-12345'),
276+
WorkloadScheduling.SUB_SLICING_AVAILABLE,
277+
),
278+
(
279+
'Sub-slicing, with failing Kueue version check',
280+
dataclasses.replace(SUB_SLICING_CASE, kueue_version_return_code=1),
281+
WorkloadScheduling.SUB_SLICING_AVAILABLE,
282+
),
272283
(
273284
'Sub-slicing, but low Kueue version',
274285
dataclasses.replace(SUB_SLICING_CASE, kueue_version='0.12.0'),
@@ -328,6 +339,20 @@ class SchedulingTestCase:
328339
SUPER_SLICING_CASE,
329340
WorkloadScheduling.SUPER_SLICING_AVAILABLE,
330341
),
342+
(
343+
'Super-slicing, with unparseable Kueue version (SHA)',
344+
dataclasses.replace(
345+
SUPER_SLICING_CASE, kueue_version='sha256-12345'
346+
),
347+
WorkloadScheduling.SUPER_SLICING_AVAILABLE,
348+
),
349+
(
350+
'Super-slicing, with failing Kueue version check',
351+
dataclasses.replace(
352+
SUPER_SLICING_CASE, kueue_version_return_code=1
353+
),
354+
WorkloadScheduling.SUPER_SLICING_AVAILABLE,
355+
),
331356
(
332357
'Super-slicing, but low Kueue version',
333358
dataclasses.replace(SUPER_SLICING_CASE, kueue_version='0.13.0'),
@@ -418,7 +443,7 @@ def test_check_if_workload_can_schedule(
418443
FeatureFlags.SUB_SLICING_ENABLED = case.sub_slicing_feature_enabled
419444
commands_tester.set_result_for_command(
420445
(
421-
0,
446+
case.kueue_version_return_code,
422447
f'registry.k8s.io/kueue/kueue:v{case.kueue_version}'
423448
if case.kueue_version
424449
else '',

0 commit comments

Comments
 (0)