From 38769a1680fc85650fcb2d4d762f9859c4e9a330 Mon Sep 17 00:00:00 2001 From: vrutz Date: Thu, 25 Jun 2026 15:58:35 +0200 Subject: [PATCH 1/3] Update autoscaler image versions and handle 'latest' Kubernetes version in autoscaler configuration --- python-lib/dku_kube/autoscaler.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python-lib/dku_kube/autoscaler.py b/python-lib/dku_kube/autoscaler.py index 096ca63..4b61e64 100644 --- a/python-lib/dku_kube/autoscaler.py +++ b/python-lib/dku_kube/autoscaler.py @@ -13,7 +13,14 @@ "1.25": "v1.25.3", "1.26": "v1.26.4", "1.27": "v1.27.3", - "1.28": "v1.28.0" + "1.28": "v1.28.0", + "1.29": "v1.29.5", + "1.30": "v1.30.7", + "1.31": "v1.31.5", + "1.32": "v1.32.7", + "1.33": "v1.33.4", + "1.34": "v1.34.3", + "1.35": "v1.35.1" } # fmt: on @@ -30,7 +37,7 @@ def has_autoscaler(kube_config_path): def add_autoscaler_if_needed(cluster_id, cluster_config, cluster_def, kube_config_path, taints, autoscaler_registry_url): if not has_autoscaler(kube_config_path): kubernetes_version = cluster_config.get("k8sVersion", None) - if _is_none_or_blank(kubernetes_version): + if _is_none_or_blank(kubernetes_version) or kubernetes_version == "latest": kubernetes_version = cluster_def.get("Version") kubernetes_version = strip_kubernetes_version(kubernetes_version) @@ -39,7 +46,7 @@ def add_autoscaler_if_needed(cluster_id, cluster_config, cluster_def, kube_confi if float(kubernetes_version) < 1.24: autoscaler_image = AUTOSCALER_IMAGES.get("1.24", "v1.24.3") else: - autoscaler_image = AUTOSCALER_IMAGES.get(kubernetes_version, "v1.28.0") + autoscaler_image = AUTOSCALER_IMAGES.get(kubernetes_version, "v1.35.1") autoscaler_full_config = list(yaml.safe_load_all(get_autoscaler_roles())) autoscaler_config = yaml.safe_load(get_autoscaler_config(cluster_id, autoscaler_image, autoscaler_registry_url)) From 7c3a785c016c5e5fbd19d85a94cc55603ab79395 Mon Sep 17 00:00:00 2001 From: vrutz Date: Fri, 26 Jun 2026 12:42:17 +0200 Subject: [PATCH 2/3] Automatically discover autoscaler images versions matching Kubernetes version + add override for airgapped envs --- .../create-eks-cluster/cluster.json | 7 + python-lib/dku_kube/autoscaler.py | 149 +++++++++++++++--- python-lib/dku_utils/tools_version.py | 32 +++- python-runnables/add-autoscaler/runnable.json | 7 + python-runnables/add-autoscaler/runnable.py | 8 +- python-runnables/add-node-pool/runnable.json | 7 + 6 files changed, 177 insertions(+), 33 deletions(-) diff --git a/python-clusters/create-eks-cluster/cluster.json b/python-clusters/create-eks-cluster/cluster.json index beed39e..fa139a8 100644 --- a/python-clusters/create-eks-cluster/cluster.json +++ b/python-clusters/create-eks-cluster/cluster.json @@ -106,6 +106,13 @@ "description": "If registry.k8s.io isn't reachable", "defaultValue" : "registry.k8s.io" }, + { + "name": "autoscalerImageTagOverride", + "label": "Autoscaler image tag override", + "type": "STRING", + "description": "For airgapped registries or registries that block tag listing. Leave empty to discover a public-style version tag.", + "mandatory": false + }, { "name": "advanced", "label": "Use Advanced Configuration", diff --git a/python-lib/dku_kube/autoscaler.py b/python-lib/dku_kube/autoscaler.py index 4b61e64..742839f 100644 --- a/python-lib/dku_kube/autoscaler.py +++ b/python-lib/dku_kube/autoscaler.py @@ -1,28 +1,27 @@ import os import json import logging +import re +import urllib.parse +import requests import yaml from .kubectl_command import run_with_timeout from dku_utils.access import _is_none_or_blank -from dku_utils.tools_version import strip_kubernetes_version +from dku_utils.tools_version import parse_kubernetes_version, strip_kubernetes_version from dku_utils.taints import Toleration -# fmt: off -AUTOSCALER_IMAGES = { - "1.24": "v1.24.3", - "1.25": "v1.25.3", - "1.26": "v1.26.4", - "1.27": "v1.27.3", - "1.28": "v1.28.0", - "1.29": "v1.29.5", - "1.30": "v1.30.7", - "1.31": "v1.31.5", - "1.32": "v1.32.7", - "1.33": "v1.33.4", - "1.34": "v1.34.3", - "1.35": "v1.35.1" -} -# fmt: on +AUTOSCALER_IMAGE_REPOSITORY = "autoscaling/cluster-autoscaler" +AUTOSCALER_TAG_RE = re.compile(r"^v([0-9]+)\.([0-9]+)\.([0-9]+)$") +# Upstream Cluster Autoscaler compatibility guidance says autoscaler and +# Kubernetes major/minor versions match from Kubernetes 1.12 onward. +AUTOSCALER_MATCHING_VERSION_CUTOFF = (1, 12) +AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR = "1.12" + + +class AutoscalerImageSelection(object): + def __init__(self, image_tag, warning=None): + self.image_tag = image_tag + self.warning = warning def has_autoscaler(kube_config_path): @@ -34,19 +33,121 @@ def has_autoscaler(kube_config_path): return len(out.strip()) > 0 +def _parse_autoscaler_tag(tag): + match = AUTOSCALER_TAG_RE.match(tag) + if match is None: + return None + return tuple(int(part) for part in match.groups()) + + +def _get_registry_tags_url(autoscaler_registry_url): + registry_url = autoscaler_registry_url.rstrip("/") + parsed_registry_url = urllib.parse.urlparse(registry_url) + if parsed_registry_url.scheme == "": + parsed_registry_url = urllib.parse.urlparse("https://%s" % registry_url) + + repository_path = "/".join(path_part.strip("/") for path_part in [parsed_registry_url.path, AUTOSCALER_IMAGE_REPOSITORY] if path_part.strip("/")) + return "%s://%s/v2/%s/tags/list" % (parsed_registry_url.scheme, parsed_registry_url.netloc, repository_path) + + +def _discover_published_autoscaler_tags(autoscaler_registry_url): + tags_url = _get_registry_tags_url(autoscaler_registry_url) + logging.info("Retrieving published cluster autoscaler image tags from %s" % tags_url) + + response = requests.get(tags_url, headers={"Accept": "application/json", "User-Agent": "DSS EKS Plugin"}, timeout=10) + if not response.ok: + logging.warning( + "Retrieving the cluster autoscaler image tags from URL '%s' failed with status: %s %s" % (tags_url, response.status_code, response.reason) + ) + logging.warning("Content of failed request: %s" % response.content) + response.raise_for_status() + + tags_response = response.json() + tags = tags_response.get("tags", []) + return [tag for tag in tags if _parse_autoscaler_tag(tag) is not None] + + +def _latest_autoscaler_tag_for_minor(tags, kubernetes_minor=None): + matching_tags = [] + for tag in tags: + parsed_tag = _parse_autoscaler_tag(tag) + if parsed_tag is None: + continue + if kubernetes_minor is not None and "%s.%s" % (parsed_tag[0], parsed_tag[1]) != kubernetes_minor: + continue + matching_tags.append((parsed_tag, tag)) + + if not matching_tags: + return None + + matching_tags.sort() + return matching_tags[-1][1] + + +def select_autoscaler_image(kubernetes_version, autoscaler_registry_url, autoscaler_image_tag_override=None): + kubernetes_major, kubernetes_minor, kubernetes_minor_string = parse_kubernetes_version(kubernetes_version) + parsed_kubernetes_minor = (kubernetes_major, kubernetes_minor) + + if not _is_none_or_blank(autoscaler_image_tag_override): + autoscaler_image_tag_override = autoscaler_image_tag_override.strip() + logging.info( + "Using configured cluster autoscaler image tag override %s for Kubernetes %s" % (autoscaler_image_tag_override, kubernetes_minor_string) + ) + return AutoscalerImageSelection(autoscaler_image_tag_override) + + try: + published_tags = _discover_published_autoscaler_tags(autoscaler_registry_url) + if parsed_kubernetes_minor >= AUTOSCALER_MATCHING_VERSION_CUTOFF: + # From Kubernetes 1.12 onward, upstream expects the autoscaler major/minor + # to match the Kubernetes major/minor; choose the latest patch for that minor. + matching_tag = _latest_autoscaler_tag_for_minor(published_tags, kubernetes_minor_string) + if matching_tag is not None: + return AutoscalerImageSelection(matching_tag) + else: + # Before Kubernetes 1.12 there is no same-minor compatibility rule to apply. + # Use the first version where that rule exists, rather than jumping to latest. + matching_tag = _latest_autoscaler_tag_for_minor(published_tags, AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR) + if matching_tag is not None: + warning = ( + "Kubernetes %s is below the cluster autoscaler version-matching cutoff 1.12. Using latest published %s tag %s from registry %s." + ) % (kubernetes_minor_string, AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR, matching_tag, autoscaler_registry_url) + logging.warning(warning) + return AutoscalerImageSelection(matching_tag, warning) + + latest_tag = _latest_autoscaler_tag_for_minor(published_tags) + if latest_tag is not None: + if parsed_kubernetes_minor < AUTOSCALER_MATCHING_VERSION_CUTOFF: + warning = ( + "Kubernetes %s is below the cluster autoscaler version-matching cutoff 1.12, " + "but no published %s tag exists in registry %s. Using latest published tag %s instead." + ) % (kubernetes_minor_string, AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR, autoscaler_registry_url, latest_tag) + else: + warning = ( + "No published cluster autoscaler image tag matches Kubernetes %s in registry %s. " + "Using latest published tag %s instead; this may be unsupported by Kubernetes/AWS compatibility guidance." + ) % (kubernetes_minor_string, autoscaler_registry_url, latest_tag) + logging.warning(warning) + return AutoscalerImageSelection(latest_tag, warning) + except (requests.RequestException, ValueError, KeyError) as e: + raise Exception("Unable to retrieve published cluster autoscaler image tags from registry %s: %s" % (autoscaler_registry_url, e)) + + raise Exception("No published cluster autoscaler image tags found in registry %s" % autoscaler_registry_url) + + def add_autoscaler_if_needed(cluster_id, cluster_config, cluster_def, kube_config_path, taints, autoscaler_registry_url): if not has_autoscaler(kube_config_path): kubernetes_version = cluster_config.get("k8sVersion", None) - if _is_none_or_blank(kubernetes_version) or kubernetes_version == "latest": + if _is_none_or_blank(kubernetes_version) or kubernetes_version.strip().lower() == "latest": kubernetes_version = cluster_def.get("Version") + if _is_none_or_blank(kubernetes_version): + raise Exception("No Kubernetes version found in cluster config or EKS cluster definition") kubernetes_version = strip_kubernetes_version(kubernetes_version) autoscaler_file_path = "autoscaler.yaml" - if float(kubernetes_version) < 1.24: - autoscaler_image = AUTOSCALER_IMAGES.get("1.24", "v1.24.3") - else: - autoscaler_image = AUTOSCALER_IMAGES.get(kubernetes_version, "v1.35.1") + autoscaler_image_tag_override = cluster_config.get("autoscalerImageTagOverride", None) + autoscaler_image_selection = select_autoscaler_image(kubernetes_version, autoscaler_registry_url, autoscaler_image_tag_override) + autoscaler_image = autoscaler_image_selection.image_tag autoscaler_full_config = list(yaml.safe_load_all(get_autoscaler_roles())) autoscaler_config = yaml.safe_load(get_autoscaler_config(cluster_id, autoscaler_image, autoscaler_registry_url)) @@ -72,6 +173,9 @@ def add_autoscaler_if_needed(cluster_id, cluster_config, cluster_def, kube_confi cmd = ["kubectl", "create", "-f", os.path.abspath(autoscaler_file_path)] logging.info("Create autoscaler with : %s" % json.dumps(cmd)) run_with_timeout(cmd, env=env, timeout=5) + return autoscaler_image_selection + + return None def get_autoscaler_roles(): @@ -247,4 +351,3 @@ def get_autoscaler_config(cluster_id, autoscaler_image_version, autoscaler_regis hostPath: path: "/etc/ssl/certs/ca-bundle.crt" """ % {"autoscalerimageversion": autoscaler_image_version, "clusterid": cluster_id, "autoscalerregistryurl": autoscaler_registry_url} - diff --git a/python-lib/dku_utils/tools_version.py b/python-lib/dku_utils/tools_version.py index fd7f78f..4b939f7 100644 --- a/python-lib/dku_utils/tools_version.py +++ b/python-lib/dku_utils/tools_version.py @@ -2,6 +2,8 @@ import re from dku_kube.kubectl_command import run_with_timeout +KUBERNETES_VERSION_RE = re.compile(r"^([0-9]+)\.([0-9]+)(?:\.([0-9]+))?$") + def get_kubectl_version(): cmd = ["kubectl", "version", "--client", "-o", "json"] @@ -23,29 +25,43 @@ def get_kubectl_version_int(kubectl_version): Extracts the integers representing the major version and the minor version coming from outcome of `kubectl version` command """ - # the kubectl version downloaded from Amazon website has a minor version finishing by '+' - # keeping only the first numeric sequence for the minor version if "major" not in kubectl_version or "minor" not in kubectl_version: raise Exception("Kubectl version found on the machine: %s. It is not correctly formatted" % kubectl_version_to_string(kubectl_version)) - regex_minor_int = re.compile("^[^0-9]*([0-9]+)([^0-9].*$|$)") - search_results_minor_int = re.search(regex_minor_int, kubectl_version["minor"]) - if not search_results_minor_int or not search_results_minor_int.groups(): + + # The kubectl version downloaded from Amazon can have a minor version ending with '+'; + # normalize it before using the generic Kubernetes major/minor parser. + normalized_version = strip_kubernetes_version(kubectl_version_to_string(kubectl_version)) + try: + major_int, minor_int, _ = parse_kubernetes_version(normalized_version) + except Exception: raise Exception("Kubectl version found on the machine: %s. It was not possible to parse" % kubectl_version_to_string(kubectl_version)) - minor_int = int(search_results_minor_int.groups()[0]) - return int(kubectl_version["major"]), minor_int + return major_int, minor_int def strip_kubernetes_version(k8s_version_input): """ Removes any additional characters from the Kubernetes version specified in the cluster creation form """ - regex_k8s_version = re.compile("^[^0-9]*([0-9]+\.?[0-9]+)([^0-9].*$|$)") + regex_k8s_version = re.compile(r"^[^0-9]*([0-9]+\.?[0-9]+)([^0-9].*$|$)") search_results_k8s_version = re.search(regex_k8s_version, k8s_version_input) if not search_results_k8s_version or not search_results_k8s_version.groups(): raise Exception("Kubectl version specified: %s. No valid Kubernetes version found", k8s_version_input) return search_results_k8s_version.groups()[0] +def parse_kubernetes_version(version): + """ + Parses a Kubernetes version string and returns its major, minor, and major.minor string. + EKS cluster versions are usually major.minor, but a patch version is accepted too. + """ + match = KUBERNETES_VERSION_RE.match(version) + if match is None: + raise Exception("Kubernetes version specified: %s. No valid Kubernetes major/minor version found" % version) + major = int(match.group(1)) + minor = int(match.group(2)) + return major, minor, "%s.%s" % (major, minor) + + def get_authenticator_version(): cmd = ["aws-iam-authenticator", "version", "-o", "json"] out, err = run_with_timeout(cmd, timeout=30) diff --git a/python-runnables/add-autoscaler/runnable.json b/python-runnables/add-autoscaler/runnable.json index 3109d74..9cc1fcc 100644 --- a/python-runnables/add-autoscaler/runnable.json +++ b/python-runnables/add-autoscaler/runnable.json @@ -32,6 +32,13 @@ "type": "STRING", "description": "If registry.k8s.io isn't reachable", "defaultValue" : "registry.k8s.io" + }, + { + "name": "autoscalerImageTagOverride", + "label": "Autoscaler image tag override", + "type": "STRING", + "description": "For airgapped registries or registries that block tag listing. Leave empty to discover a public-style version tag.", + "mandatory": false } ] } diff --git a/python-runnables/add-autoscaler/runnable.py b/python-runnables/add-autoscaler/runnable.py index 3dd260c..30d49f4 100644 --- a/python-runnables/add-autoscaler/runnable.py +++ b/python-runnables/add-autoscaler/runnable.py @@ -1,3 +1,5 @@ +import html + from dataiku.runnables import Runnable from dku_kube.autoscaler import add_autoscaler_if_needed, has_autoscaler from dku_utils.cluster import get_cluster_from_dss_cluster @@ -30,5 +32,7 @@ def run(self, progress_callback): return "
An autoscaler pod already runs
" else: autoscaler_registry_url = self.config.get("autoscalerRegistryURL", "registry.k8s.io") - add_autoscaler_if_needed(cluster_id, self.config, cluster_def, kube_config_path, [], autoscaler_registry_url) - return "
Created an autoscaler pod
" + autoscaler_image_selection = add_autoscaler_if_needed(cluster_id, self.config, cluster_def, kube_config_path, [], autoscaler_registry_url) + if autoscaler_image_selection is not None and autoscaler_image_selection.warning is not None: + return '
Created an autoscaler pod
%s
' % html.escape(autoscaler_image_selection.warning) + return "
Created an autoscaler pod
" diff --git a/python-runnables/add-node-pool/runnable.json b/python-runnables/add-node-pool/runnable.json index 8d79f7b..5010b1c 100644 --- a/python-runnables/add-node-pool/runnable.json +++ b/python-runnables/add-node-pool/runnable.json @@ -46,6 +46,13 @@ "type": "STRING", "description": "If registry.k8s.io isn't reachable", "defaultValue" : "registry.k8s.io" + }, + { + "name": "autoscalerImageTagOverride", + "label": "Autoscaler image tag override", + "type": "STRING", + "description": "For airgapped registries or registries that block tag listing. Leave empty to discover a public-style version tag.", + "mandatory": false } ] } From 003e3d2cd426355da391b879e8f22b8a9c7dd9c0 Mon Sep 17 00:00:00 2001 From: vrutz Date: Fri, 26 Jun 2026 16:03:50 +0200 Subject: [PATCH 3/3] add fallback to prevent forcing usage of the autoscaler image tag override --- python-clusters/create-eks-cluster/cluster.py | 4 +- python-lib/dku_kube/autoscaler.py | 148 +++++++++++------- python-runnables/add-autoscaler/runnable.py | 6 +- python-runnables/add-node-pool/runnable.py | 4 +- 4 files changed, 96 insertions(+), 66 deletions(-) diff --git a/python-clusters/create-eks-cluster/cluster.py b/python-clusters/create-eks-cluster/cluster.py index 029d670..ad8e54f 100644 --- a/python-clusters/create-eks-cluster/cluster.py +++ b/python-clusters/create-eks-cluster/cluster.py @@ -295,9 +295,7 @@ def add_vm_to_sg(): logging.info("At least one node group is autoscaling, ensuring autoscaler") autoscaled_taints = list(autoscaled_node_pools_taints) if autoscaled_node_pools_taints else [] autoscaler_registry_url = self.config.get("autoscalerRegistryURL", "registry.k8s.io") - add_autoscaler_if_needed( - self.cluster_id, self.config, cluster_info, kube_config_path, autoscaled_taints, autoscaler_registry_url - ) + add_autoscaler_if_needed(self.cluster_id, self.config, cluster_info, kube_config_path, autoscaled_taints, autoscaler_registry_url) with open(kube_config_path, "r") as f: kube_config = yaml.safe_load(f) diff --git a/python-lib/dku_kube/autoscaler.py b/python-lib/dku_kube/autoscaler.py index 742839f..2070372 100644 --- a/python-lib/dku_kube/autoscaler.py +++ b/python-lib/dku_kube/autoscaler.py @@ -17,11 +17,25 @@ AUTOSCALER_MATCHING_VERSION_CUTOFF = (1, 12) AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR = "1.12" - -class AutoscalerImageSelection(object): - def __init__(self, image_tag, warning=None): - self.image_tag = image_tag - self.warning = warning +# Used only when registry tag discovery is unavailable, which preserves the +# existing custom-registry workflow for registries that do not expose tags/list. +# Keep values pinned to tags known to exist in registry.k8s.io. +# fmt: off +AUTOSCALER_IMAGE_FALLBACKS = { + "1.24": "v1.24.3", + "1.25": "v1.25.3", + "1.26": "v1.26.4", + "1.27": "v1.27.3", + "1.28": "v1.28.0", + "1.29": "v1.29.5", + "1.30": "v1.30.7", + "1.31": "v1.31.5", + "1.32": "v1.32.7", + "1.33": "v1.33.4", + "1.34": "v1.34.3", + "1.35": "v1.35.0", +} +# fmt: on def has_autoscaler(kube_config_path): @@ -67,71 +81,95 @@ def _discover_published_autoscaler_tags(autoscaler_registry_url): return [tag for tag in tags if _parse_autoscaler_tag(tag) is not None] -def _latest_autoscaler_tag_for_minor(tags, kubernetes_minor=None): +def _select_matching_autoscaler_tag_from_tags(tags, parsed_kubernetes_minor): + if parsed_kubernetes_minor >= AUTOSCALER_MATCHING_VERSION_CUTOFF: + # From Kubernetes 1.12 onward, upstream expects the autoscaler major/minor + # to match the Kubernetes major/minor; choose the latest patch for that minor. + matching_minor = parsed_kubernetes_minor + else: + # Before Kubernetes 1.12 there is no same-minor compatibility rule to apply. + # Use the first version where that rule exists, rather than jumping to latest. + matching_minor = AUTOSCALER_MATCHING_VERSION_CUTOFF + matching_tags = [] for tag in tags: parsed_tag = _parse_autoscaler_tag(tag) - if parsed_tag is None: - continue - if kubernetes_minor is not None and "%s.%s" % (parsed_tag[0], parsed_tag[1]) != kubernetes_minor: - continue - matching_tags.append((parsed_tag, tag)) + if parsed_tag is not None and parsed_tag[:2] == matching_minor: + matching_tags.append((parsed_tag, tag)) - if not matching_tags: - return None - - matching_tags.sort() - return matching_tags[-1][1] + if matching_tags: + matching_tags.sort() + return matching_tags[-1][1] def select_autoscaler_image(kubernetes_version, autoscaler_registry_url, autoscaler_image_tag_override=None): - kubernetes_major, kubernetes_minor, kubernetes_minor_string = parse_kubernetes_version(kubernetes_version) + kubernetes_major, kubernetes_minor, kubernetes_version_string = parse_kubernetes_version(kubernetes_version) parsed_kubernetes_minor = (kubernetes_major, kubernetes_minor) if not _is_none_or_blank(autoscaler_image_tag_override): autoscaler_image_tag_override = autoscaler_image_tag_override.strip() logging.info( - "Using configured cluster autoscaler image tag override %s for Kubernetes %s" % (autoscaler_image_tag_override, kubernetes_minor_string) + "Using configured cluster autoscaler image tag override %s for Kubernetes %s" % (autoscaler_image_tag_override, kubernetes_version_string) ) - return AutoscalerImageSelection(autoscaler_image_tag_override) + return autoscaler_image_tag_override try: published_tags = _discover_published_autoscaler_tags(autoscaler_registry_url) - if parsed_kubernetes_minor >= AUTOSCALER_MATCHING_VERSION_CUTOFF: - # From Kubernetes 1.12 onward, upstream expects the autoscaler major/minor - # to match the Kubernetes major/minor; choose the latest patch for that minor. - matching_tag = _latest_autoscaler_tag_for_minor(published_tags, kubernetes_minor_string) - if matching_tag is not None: - return AutoscalerImageSelection(matching_tag) + selected_tag = _select_matching_autoscaler_tag_from_tags(published_tags, parsed_kubernetes_minor) + + if selected_tag is None: + logging.warning( + "No published cluster autoscaler image tag matches Kubernetes %s in registry %s. Fallback will be used." + % ( + kubernetes_version_string, + autoscaler_registry_url, + ) + ) + elif parsed_kubernetes_minor < AUTOSCALER_MATCHING_VERSION_CUTOFF: + logging.warning( + "Kubernetes %s is below the cluster autoscaler version-matching cutoff %s. Using tag %s from registry %s." + % ( + kubernetes_version_string, + AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR, + selected_tag, + autoscaler_registry_url, + ) + ) + return selected_tag else: - # Before Kubernetes 1.12 there is no same-minor compatibility rule to apply. - # Use the first version where that rule exists, rather than jumping to latest. - matching_tag = _latest_autoscaler_tag_for_minor(published_tags, AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR) - if matching_tag is not None: - warning = ( - "Kubernetes %s is below the cluster autoscaler version-matching cutoff 1.12. Using latest published %s tag %s from registry %s." - ) % (kubernetes_minor_string, AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR, matching_tag, autoscaler_registry_url) - logging.warning(warning) - return AutoscalerImageSelection(matching_tag, warning) - - latest_tag = _latest_autoscaler_tag_for_minor(published_tags) - if latest_tag is not None: - if parsed_kubernetes_minor < AUTOSCALER_MATCHING_VERSION_CUTOFF: - warning = ( - "Kubernetes %s is below the cluster autoscaler version-matching cutoff 1.12, " - "but no published %s tag exists in registry %s. Using latest published tag %s instead." - ) % (kubernetes_minor_string, AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR, autoscaler_registry_url, latest_tag) - else: - warning = ( - "No published cluster autoscaler image tag matches Kubernetes %s in registry %s. " - "Using latest published tag %s instead; this may be unsupported by Kubernetes/AWS compatibility guidance." - ) % (kubernetes_minor_string, autoscaler_registry_url, latest_tag) - logging.warning(warning) - return AutoscalerImageSelection(latest_tag, warning) + logging.info("Using cluster autoscaler image tag %s for Kubernetes %s" % (selected_tag, kubernetes_version_string)) + return selected_tag except (requests.RequestException, ValueError, KeyError) as e: - raise Exception("Unable to retrieve published cluster autoscaler image tags from registry %s: %s" % (autoscaler_registry_url, e)) + logging.warning( + "Unable to retrieve published cluster autoscaler image tags from registry %s. Fallback will be used.\n %s." % (autoscaler_registry_url, e) + ) - raise Exception("No published cluster autoscaler image tags found in registry %s" % autoscaler_registry_url) + fallback_tags = list(AUTOSCALER_IMAGE_FALLBACKS.values()) + fallback_tag = _select_matching_autoscaler_tag_from_tags(fallback_tags, parsed_kubernetes_minor) + if fallback_tag is not None: + if parsed_kubernetes_minor < AUTOSCALER_MATCHING_VERSION_CUTOFF: + logging.warning( + "Kubernetes %s is below the cluster autoscaler version-matching cutoff %s. Using bundled fallback tag %s." + % ( + kubernetes_version_string, + AUTOSCALER_MATCHING_VERSION_CUTOFF_MINOR, + fallback_tag, + ) + ) + else: + logging.info("Using bundled fallback tag %s for Kubernetes %s." % (fallback_tag, kubernetes_version_string)) + return fallback_tag + + latest_supported_version = sorted(AUTOSCALER_IMAGE_FALLBACKS.keys(), key=lambda version: tuple(int(part) for part in version.split(".")))[-1] + latest_fallback_tag = AUTOSCALER_IMAGE_FALLBACKS[latest_supported_version] + logging.info( + "No bundled fallback matches Kubernetes %s. Using latest bundled fallback tag %s instead." + % ( + kubernetes_version_string, + latest_fallback_tag, + ) + ) + return latest_fallback_tag def add_autoscaler_if_needed(cluster_id, cluster_config, cluster_def, kube_config_path, taints, autoscaler_registry_url): @@ -146,11 +184,10 @@ def add_autoscaler_if_needed(cluster_id, cluster_config, cluster_def, kube_confi autoscaler_file_path = "autoscaler.yaml" autoscaler_image_tag_override = cluster_config.get("autoscalerImageTagOverride", None) - autoscaler_image_selection = select_autoscaler_image(kubernetes_version, autoscaler_registry_url, autoscaler_image_tag_override) - autoscaler_image = autoscaler_image_selection.image_tag + autoscaler_image_tag = select_autoscaler_image(kubernetes_version, autoscaler_registry_url, autoscaler_image_tag_override) autoscaler_full_config = list(yaml.safe_load_all(get_autoscaler_roles())) - autoscaler_config = yaml.safe_load(get_autoscaler_config(cluster_id, autoscaler_image, autoscaler_registry_url)) + autoscaler_config = yaml.safe_load(get_autoscaler_config(cluster_id, autoscaler_image_tag, autoscaler_registry_url)) tolerations = set() # If there are any taints to patch the autoscaler with in the node group(s) to create, @@ -173,9 +210,6 @@ def add_autoscaler_if_needed(cluster_id, cluster_config, cluster_def, kube_confi cmd = ["kubectl", "create", "-f", os.path.abspath(autoscaler_file_path)] logging.info("Create autoscaler with : %s" % json.dumps(cmd)) run_with_timeout(cmd, env=env, timeout=5) - return autoscaler_image_selection - - return None def get_autoscaler_roles(): diff --git a/python-runnables/add-autoscaler/runnable.py b/python-runnables/add-autoscaler/runnable.py index 30d49f4..511d0b6 100644 --- a/python-runnables/add-autoscaler/runnable.py +++ b/python-runnables/add-autoscaler/runnable.py @@ -1,5 +1,3 @@ -import html - from dataiku.runnables import Runnable from dku_kube.autoscaler import add_autoscaler_if_needed, has_autoscaler from dku_utils.cluster import get_cluster_from_dss_cluster @@ -32,7 +30,5 @@ def run(self, progress_callback): return "
An autoscaler pod already runs
" else: autoscaler_registry_url = self.config.get("autoscalerRegistryURL", "registry.k8s.io") - autoscaler_image_selection = add_autoscaler_if_needed(cluster_id, self.config, cluster_def, kube_config_path, [], autoscaler_registry_url) - if autoscaler_image_selection is not None and autoscaler_image_selection.warning is not None: - return '
Created an autoscaler pod
%s
' % html.escape(autoscaler_image_selection.warning) + add_autoscaler_if_needed(cluster_id, self.config, cluster_def, kube_config_path, [], autoscaler_registry_url) return "
Created an autoscaler pod
" diff --git a/python-runnables/add-node-pool/runnable.py b/python-runnables/add-node-pool/runnable.py index 9f8288d..539c49f 100644 --- a/python-runnables/add-node-pool/runnable.py +++ b/python-runnables/add-node-pool/runnable.py @@ -108,7 +108,9 @@ def run(self, progress_callback): if node_pool.get("numNodesAutoscaling", False): logging.info("Nodegroup is autoscaling, ensuring autoscaler") autoscaler_registry_url = self.config.get("autoscalerRegistryURL", "registry.k8s.io") - add_autoscaler_if_needed(cluster_id, self.config, cluster_data.get("cluster"), kube_config_path, node_group_taints, autoscaler_registry_url) + add_autoscaler_if_needed( + cluster_id, self.config, cluster_data.get("cluster"), kube_config_path, node_group_taints, autoscaler_registry_url + ) if node_pool.get("enableGPU", False): logging.info("Nodegroup is GPU-enabled, ensuring NVIDIA GPU Drivers")