diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 5bfdc81cb3..0000000000 --- a/.pylintrc +++ /dev/null @@ -1,160 +0,0 @@ -# copybara:strip_begin(internal) -# This is based on http://google3/devtools/gpylint/config/base/rc -# copybara:strip_end - - -# Default configuration for pylint, which should pass for all (incremental) changes. -# See CONTRIBUTING.md for more. - -[MESSAGES CONTROL] -# List of checkers and warnings to enable. -enable=indexing-exception,old-raise-syntax - -disable=abstract-method, - attribute-defined-outside-init, - bad-option-value, - c-extension-no-member, - design, - file-ignored, - fixme, - global-statement, - invalid-metaclass, - locally-disabled, - locally-enabled, - misplaced-comparison-constant, - no-else-break, - no-else-continue, - no-else-raise, - no-else-return, - no-self-use, - pointless-except, - redundant-u-string-prefix, - similarities, - star-args, - suppressed-message, - trailing-newlines, - ungrouped-imports, - unnecessary-pass, - unspecified-encoding, - unsubscriptable-object, - useless-else-on-loop, - useless-object-inheritance, - useless-suppression, - -[BASIC] - -# Regular expression which should only match the name -# of functions or classes which do not require a docstring. -no-docstring-rgx=(__.*__|main) - -# Min length in lines of a function that requires a docstring. -docstring-min-length=12 - -# Regular expression which should only match correct module names. The -# leading underscore is sanctioned for private modules by Google's style -# guide. -# -# There are exceptions to the basic rule (_?[a-z][a-z0-9_]*) to cover -# requirements of Python's module system and of the presubmit framework. 
-module-rgx=^(_?[a-z][a-z0-9_]*)|__init__|PRESUBMIT|PRESUBMIT_unittest$ - -# Regular expression which should only match correct module level names -const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ - -# Regular expression which should only match correct class attribute -class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ - -# Regular expression which should only match correct class names -class-rgx=^_?[A-Z][a-zA-Z0-9]*$ - -# Regular expression which should only match correct function names. -# 'PascalCase' and 'snake_case' group names are used for consistency of naming -# styles across functions and methods. -function-rgx=^(?:(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ - -# Regular expression which should only match correct method names. -# 'PascalCase' and 'snake_case' group names are used for consistency of naming -# styles across functions and methods. 'exempt' indicates a name which is -# consistent with all naming styles. -method-rgx=(?x)^(?:(?P_[a-z0-9_]+__|next)|(?P_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P_{0,2}[a-z][a-z0-9_]*))$ - -# Regular expression which should only match correct instance attribute names -attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ - -# Regular expression which should only match correct argument names -argument-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression which should only match correct variable names -variable-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression which should only match correct list comprehension / -# generator expression variable names -inlinevar-rgx=^[a-z][a-z0-9_]*$ - -# Regular expression which should only match correct TypeVar names -typevar-rgx=^_{0,2}(?:[^\W\da-z_]+|(?:[^\W\da-z_]+[^\WA-Z_]+)+T?)(?:_co(?:ntra)?)?$ - -# Good variable names which should always be accepted, separated by a comma -good-names=main,_ - -# List of decorators that define properties, such as abc.abstractproperty. 
-property-classes=abc.abstractproperty,functools.cached_property,google3.pyglib.function_utils.cached.property,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl,werkzeug.utils.cached_property - -[VARIABLES] - -# Tells whether we should check for unused import in __init__ files. -init-import=no - -# A regular expression matching names used for dummy variables (i.e. not used). -dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) - -# List of additional names supposed to be defined in builtins. Remember that -# you should avoid to define new builtins when possible. -additional-builtins= - -# List of modules that are allowed to redefine builtins. -redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools - -[STRING] - -# This flag controls whether the implicit-str-concat should -# generate a warning on implicit string concatenation in sequences defined over -# several lines. -check-str-concat-over-line-jumps=yes - -[CLASSES] - -# List of method names used to declare (i.e. assign) instance attributes. -defining-attr-methods=__init__,__new__,setUp - -# "class_" is also a valid for the first argument to a class method. -valid-classmethod-first-arg=cls,class_ - - -[FORMAT] - -# Maximum number of characters on a single line. -max-line-length=80 - -# Regexp for a line that is allowed to be longer than the limit. -# This "ignore" regex is today composed of: -# (1) p4 expansion $Id$ lines -# (2) Depot paths for go/ifthisthenthatlint directives. -# (3) Long string constants not containing whitespaces. This is needed now we -# have switched Pyformat to use Pyink, and it would wrap strings constants -# with a narrow range of lengths (less than 80 - indentation) in parens. -# This causes GPylint to complain otherwise allowed per -# go/pystyle#line-length. See b/262137806 for more information. 
-# Other lines might be allowed to be long by gpylint.pyformat_filter: see that -# module for more information. -ignore-long-lines=(?x)(\$Id:\s\/\/depot\/.+\#\d+\s\$|^\s*\#\ LINT\.ThenChange|^\s*\w+\ =\ (?P['"])\S+(?P=quote)$) - -# Maximum number of lines in a module -max-module-lines=99999 - -# String used as indentation unit. We differ from PEP8's normal 4 spaces. -indent-string=' ' - -# Do not warn about multiple statements on a single line for constructs like -# if test: stmt -single-line-if-stmt=y diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes_management_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes_management_benchmark.py new file mode 100644 index 0000000000..dbf07127ae --- /dev/null +++ b/perfkitbenchmarker/linux_benchmarks/kubernetes_management_benchmark.py @@ -0,0 +1,792 @@ +# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Benchmark for Kubernetes management plane operations. + +Measures GKE/EKS/AKS control-plane API responsiveness via three scenarios: + A. Concurrent node-pool create/upgrade/delete. + B. Node-pool create overlapping with a long-running cluster update. + C. Large-scale node-pool provisioning (single scale or sweep). 
+ +Optimizations for minimum run time: + - Streaming concurrency in Scenario C (no batch barriers) + - Optional pipelined Scenario A (create->upgrade->delete per thread) + - Reduced poll_interval in provider WaitForOperation (5s vs 10s) + - Per-op threads capped at _MAX_CONCURRENT to avoid OS limits + - Accurate delete success rate via attempted_ops denominator +""" + +import copy +import dataclasses +import statistics +import threading +import time +from typing import Callable +from unicodedata import name + +from absl import flags +from absl import logging +from perfkitbenchmarker import background_tasks +from perfkitbenchmarker import benchmark_spec as bm_spec +from perfkitbenchmarker import configs +from perfkitbenchmarker import errors +from perfkitbenchmarker import sample +from perfkitbenchmarker.configs import benchmark_config_spec +from perfkitbenchmarker.resources.container_service import ( + container as container_lib) +from perfkitbenchmarker.resources.container_service import kubectl +from perfkitbenchmarker.resources.container_service import kubernetes_cluster + +_SLEEP_POD_NAME = 'pkb-mgmt-sleep' + +BENCHMARK_NAME = 'kubernetes_management' + +BENCHMARK_CONFIG = """ +kubernetes_management: + description: > + Benchmarks GKE/EKS/AKS management plane operations: concurrent node pool + create/upgrade/delete, overlapping cluster + node-pool ops, and large-scale + provisioning. Focused on control-plane API responsiveness. + Spec regions: GCP us-central1, AWS us-east-1 (closest), Azure eastus. + Equivalent machine types across clouds per Google benchmark spec. 
+ container_cluster: + type: Kubernetes + vm_count: 1 + vm_spec: + GCP: + # us-central1-a: spec primary region for GCP + # e2-standard-2: 2 vCPU 8GB — equivalent to t3.medium / D2s_v3 + machine_type: e2-standard-2 + zone: us-central1-a + AWS: + # us-east-1a: closest comparable region to GCP us-central1 + # t3.medium: 2 vCPU 4GB — closest equivalent to e2-standard-2 + machine_type: t3.medium + zone: us-east-1a + Azure: + # eastus: closest comparable region to GCP us-central1 + # Standard_D2s_v3: 2 vCPU 8GB — equivalent to e2-standard-2 + machine_type: Standard_D2s_v3 + zone: eastus +""" + +_VALID_SCENARIOS = frozenset({'A', 'B', 'C'}) + +_CONCURRENT_NODEPOOLS = flags.DEFINE_integer( + 'k8s_mgmt_concurrent_nodepools', + 5, + 'Number of node pools to create/upgrade/delete concurrently in Scenario A.', +) +_LARGE_SCALE_NODEPOOLS = flags.DEFINE_integer( + 'k8s_mgmt_large_scale_nodepools', + 1000, + 'Number of node pools to provision in the large-scale Scenario C. ' + + 'Spec target is 1000; ensure VPC/quota is available before running.', +) +_NODES_PER_NODEPOOL = flags.DEFINE_integer( + 'k8s_mgmt_nodes_per_nodepool', + 2, + 'Number of nodes per node pool. Google spec: 2 nodes per pool.', +) +_INITIAL_VERSION = flags.DEFINE_string( + 'k8s_mgmt_initial_version', + None, + 'Kubernetes version for newly-created node pools (N-1). None = auto.', +) +_TARGET_VERSION = flags.DEFINE_string( + 'k8s_mgmt_target_version', + None, + 'Kubernetes version to upgrade node pools to (N). None = cluster version.', +) +_SCENARIOS = flags.DEFINE_list( + 'k8s_mgmt_scenarios', + ['A', 'B', 'C'], + 'Comma-separated subset of scenarios to run. Valid values: A, B, C.', +) +_SCALE_SWEEP = flags.DEFINE_list( + 'k8s_mgmt_scale_sweep', + [], + 'Comma-separated list of node-pool counts for Scenario C scale sweep. ' + + 'Each scale runs as a separate sub-run with full create/delete cycle. ' + + 'Example: --k8s_mgmt_scale_sweep=10,50,100,500,1000. 
' + + 'If empty, uses --k8s_mgmt_large_scale_nodepools.', +) +_MAX_CONCURRENT = flags.DEFINE_integer( + 'k8s_mgmt_max_concurrent', + 50, + 'Cap on concurrent provider API calls within a batch. ' + + 'Higher = faster but more aggressive on connection pools.', +) +_PIPELINE_SCENARIO_A = flags.DEFINE_boolean( + 'k8s_mgmt_pipeline_scenario_a', + True, + 'If True, run Scenario A as per-pool pipeline (create->upgrade->delete ' + + 'back-to-back per thread). Minimizes wall time. ' + + 'Set to False for spec-strict phase-by-phase.', +) + +# AKS caps node-pool names at 12 chars — keep all names within that limit. +_PREFIX = 'pkbm' + + +def _ScenarioAName(i): + return f'{_PREFIX}a{i:03d}' + + +_SCENARIO_B_NAME = f'{_PREFIX}b' + + +def _ScenarioCName(i): + return f'{_PREFIX}c{i:04d}' + +@dataclasses.dataclass +class _OpResult: + """Holds timing and outcome for a single async management-plane operation.""" + name: str + init_dur: float + e2e_dur: float + error: Exception | None = None + + +def GetConfig(user_config): + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + + +def CheckPrerequisites( + benchmark_config: benchmark_config_spec.BenchmarkConfigSpec,): + """Validates flag values (case-insensitively) and cluster type up front.""" + invalid = [ + s for s in _SCENARIOS.value if s.strip().upper() not in _VALID_SCENARIOS + ] + if invalid: + raise errors.Config.InvalidValue( + f'Invalid value(s) for --k8s_mgmt_scenarios: {invalid}. 
' + + f'Valid options: {sorted(_VALID_SCENARIOS)}.') + for s in _SCALE_SWEEP.value: + try: + int(s.strip()) + except ValueError as e: + raise errors.Config.InvalidValue( + f'Non-integer value in --k8s_mgmt_scale_sweep: {s!r}') from e + if benchmark_config.container_cluster.type != 'Kubernetes': + raise errors.Config.InvalidValue( + 'kubernetes_management benchmark requires a Kubernetes' + + ' container cluster.') + + +def Prepare(benchmark_spec: bm_spec.BenchmarkSpec) -> None: + """Asserts the cluster is reachable; deploys spec-defined sleep workload.""" + cluster = benchmark_spec.container_cluster + assert isinstance(cluster, kubernetes_cluster.KubernetesCluster) + benchmark_spec.always_call_cleanup = True + logging.info( + 'kubernetes_management Prepare: cluster=%s, version=%s', + cluster.name, + cluster.k8s_version, + ) + # Spec workload: "a simple container that sleeps for a given time". + # Confirms data-plane reachability; generates no data-plane load. + kubectl.RunKubectlCommand( + [ + 'run', + _SLEEP_POD_NAME, + '--image=busybox', + '--restart=Never', + '--', + 'sleep', + '86400', + ], + ) + +def _CleanStartSweep(cluster: kubernetes_cluster.KubernetesCluster) -> None: + """Deletes any stale pkbm* node pools so each run starts clean (spec C.2).""" + try: + stale = [ + n for n in cluster.GetNodePoolNames() if n.startswith(_PREFIX) + ] + except Exception: # pylint: disable=broad-except + logging.exception('CleanStart: failed to list node pools') + return + if not stale: + logging.info( + 'CleanStart: no stale pools found — clean start confirmed.') + return + logging.warning('CleanStart: deleting %d stale pools: %s', len(stale), + stale) + background_tasks.RunThreaded(cluster.DeleteNodePool, stale) + + +def Run(benchmark_spec: bm_spec.BenchmarkSpec) -> list[sample.Sample]: + """Runs the selected scenarios and returns flat list of samples.""" + cluster = benchmark_spec.container_cluster + assert isinstance(cluster, kubernetes_cluster.KubernetesCluster) + + # 
Spec C.2: start clean. + _CleanStartSweep(cluster) + + # Resolve versions once; log clearly; tag every sample. + # Google spec: initial=N-1, target=N (adjacent minor upgrade). + flag_initial = _INITIAL_VERSION.value + flag_target = _TARGET_VERSION.value + if flag_initial and flag_target: + initial, target = flag_initial, flag_target + source = 'flags' + else: + resolved_initial, resolved_target = cluster.ResolveNodePoolVersions() + initial = flag_initial or resolved_initial + target = flag_target or resolved_target + source = 'auto-resolved' if not (flag_initial or flag_target) else 'mixed' + + logging.info( + 'NodePool versions (%s): initial=%s -> target=%s ' + + '(cluster k8s_version=%s) | nodes_per_pool=%d | machine_type=%s', + source, + initial, + target, + cluster.k8s_version, + _NODES_PER_NODEPOOL.value, + cluster.default_nodepool.machine_type + if hasattr(cluster, 'default_nodepool') else 'unknown', + ) + + scenarios = {s.strip().upper() for s in _SCENARIOS.value} + samples: list[sample.Sample] = [] + + if 'A' in scenarios: + samples += _RunScenarioA(cluster, initial, target) + if 'B' in scenarios: + samples += _RunScenarioB(cluster, initial) + if 'C' in scenarios: + # fix: Scenario A/B pools may still be in Deleting state and count + # toward AKS's 100-pool cluster limit. Sweep them out before Scenario C + # so we don't hit MaxAgentPoolCountReached mid-run. + _CleanStartSweep(cluster) + scales = ([int(x.strip()) for x in _SCALE_SWEEP.value] + if _SCALE_SWEEP.value else [_LARGE_SCALE_NODEPOOLS.value]) + logging.info('Scenario C: scale sweep = %s', scales) + for scale in scales: + scenario_c_samples = _RunScenarioC(cluster, initial, scale) + for s in scenario_c_samples: + s.metadata['scenario_c_scale'] = str(scale) + samples += scenario_c_samples + + # Tag all samples with version path and run config for published results. 
+ run_meta = { + 'initial_version': str(initial), + 'target_version': str(target), + 'cluster_k8s_version': str(cluster.k8s_version), + 'nodes_per_nodepool': str(_NODES_PER_NODEPOOL.value), + 'concurrent_nodepools': str(_CONCURRENT_NODEPOOLS.value), + } + for s in samples: + s.metadata.update(run_meta) + + return samples + + +def Cleanup(benchmark_spec: bm_spec.BenchmarkSpec) -> None: + """Best-effort delete of leftover benchmark node pools and sleep pod.""" + cluster = benchmark_spec.container_cluster + if cluster is None: + return + kubectl.RunKubectlCommand( + ['delete', 'pod', _SLEEP_POD_NAME, '--ignore-not-found'], + raise_on_failure=False, + ) + try: + leftover = [ + n for n in cluster.GetNodePoolNames() if n.startswith(_PREFIX) + ] + except Exception: # pylint: disable=broad-except + logging.exception('Cleanup: failed to list node pools') + return + if not leftover: + return + logging.info('Cleanup: deleting %d leftover node pools', len(leftover)) + background_tasks.RunThreaded(cluster.DeleteNodePool, leftover) + + +# --------------------------------------------------------------------------- +# Scenario A +# --------------------------------------------------------------------------- + + +def _RunScenarioA( + cluster: kubernetes_cluster.KubernetesCluster, + initial: str, + target: str, +) -> list[sample.Sample]: + """Concurrent CreateNodePool, UpgradeNodePool, DeleteNodePool.""" + n = _CONCURRENT_NODEPOOLS.value + if _PIPELINE_SCENARIO_A.value: + logging.info( + 'Scenario A (pipelined): %d pools, initial=%s, target=%s', + n, initial, target) + return _RunScenarioAPipelined(cluster, n, initial, target) + + logging.info( + 'Scenario A (phase-by-phase): %d pools, initial=%s, target=%s', + n, initial, target) + pool_names = [_ScenarioAName(i) for i in range(n)] + configs_ = [_MakeNodePoolConfig(cluster, name) for name in pool_names] + samples: list[sample.Sample] = [] + + # ── Phase 1: concurrent creates ───────────────────────────────────────── + create_results 
= _RunAsync( + kickoff=lambda cfg: cluster.CreateNodePoolAsync( + cfg, node_version=initial), + wait_fn=cluster.WaitForOperation, + items=configs_, + get_name=lambda cfg: cfg.name, + ) + samples += _OpSamples('ScenarioA_Create', + create_results, + attempted_ops=len(pool_names)) + + # ── Phase 2: concurrent upgrades (only successfully created pools) ─────── + created = [r.name for r in create_results if r.error is None] + logging.info( + 'Scenario A: %d/%d pools created — proceeding to upgrade', + len(created), n) + upgrade_results = _RunAsync( + kickoff=lambda name: cluster.UpgradeNodePoolAsync(name, target), + wait_fn=cluster.WaitForOperation, + items=created, + get_name=str, + ) + samples += _OpSamples('ScenarioA_Upgrade', + upgrade_results, + attempted_ops=len(created)) + + # # ── Idiomatic Control Plane Synchronization Barrier ────────────────────── + # # Give the GKE control plane a brief window to register the async ops. + # time.sleep(15) + + # # Check if the cluster object has our native upgrade tracking capability. + # if hasattr(cluster, 'HasActiveUpgradeOperations'): + # logging.info('GCP GKE cluster detected; polling via provider API.') + + # while cluster.HasActiveUpgradeOperations(): + # logging.info( + # 'Upgrade operations active; holding delete phase for 30s.') + # time.sleep(30) + + # logging.info( + # 'All upgrade ops completed; flushing API gateway write-locks.') + # time.sleep(10) + # else: + # # Non-GCP providers (Azure AKS / AWS EKS): standard safety pause. 
+ # logging.info( + # 'Non-GCP cluster; proceeding with stabilization pause.') + # time.sleep(5) + + # ── Phase 3: concurrent deletes (live-list to catch EKS rollbacks) ────── + alive = [ + p for p in cluster.GetNodePoolNames() if p.startswith(f'{_PREFIX}a') + ] + logging.info( + 'Scenario A: %d live pools found for delete (originally %d)', + len(alive), n) + delete_results = _RunAsync( + kickoff=cluster.DeleteNodePoolAsync, + wait_fn=cluster.WaitForOperation, + items=alive, + get_name=str, + ) + # attempted_ops=n: success rate reflects original request, not just live. + # EKS rolls back timed-out pools silently — without this shows 100%. + samples += _OpSamples('ScenarioA_Delete', delete_results, attempted_ops=n) + return samples + + +def _RunScenarioAPipelined( + cluster: kubernetes_cluster.KubernetesCluster, + n: int, + initial: str, + target: str, +) -> list[sample.Sample]: + """Per-pool pipeline: create->upgrade->delete back-to-back per thread. + + Minimizes wall time: max_i(create_i + upgrade_i + delete_i) vs + max(creates)+max(upgrades)+max(deletes) in phase-by-phase mode. + Trade-off: ops run under mixed-type concurrent load. 
+ """ + pool_names = [_ScenarioAName(i) for i in range(n)] + creates = _Results() + upgrades = _Results() + deletes = _Results() + + def DoPool(name: str): + cfg = _MakeNodePoolConfig(cluster, name) + init, e2e, err = _TimedAsync( + lambda: cluster.CreateNodePoolAsync(cfg, node_version=initial), + cluster.WaitForOperation, + ) + creates.add(name, init, e2e, err) + if err is not None: + return + init, e2e, err = _TimedAsync( + lambda: cluster.UpgradeNodePoolAsync(name, target), + cluster.WaitForOperation, + ) + upgrades.add(name, init, e2e, err) + init, e2e, err = _TimedAsync( + lambda: cluster.DeleteNodePoolAsync(name), + cluster.WaitForOperation, + ) + deletes.add(name, init, e2e, err) + + background_tasks.RunThreaded( + DoPool, + pool_names, + max_concurrent_threads=min(n, _MAX_CONCURRENT.value), + ) + samples: list[sample.Sample] = [] + samples += _OpSamples('ScenarioA_Create', creates.entries, attempted_ops=n) + samples += _OpSamples('ScenarioA_Upgrade', + upgrades.entries, + attempted_ops=n) + samples += _OpSamples('ScenarioA_Delete', deletes.entries, attempted_ops=n) + return samples + + +# --------------------------------------------------------------------------- +# Scenario B +# --------------------------------------------------------------------------- + + +def _RunScenarioB( + cluster: kubernetes_cluster.KubernetesCluster, + initial: str, +) -> list[sample.Sample]: + """CreateNodePool fired concurrently with a long-running cluster update. + + Both ops kick off async on separate threads; initiation + E2E latency + recorded independently. Overlap window = ClusterUpdate E2E latency. 
+ """ + logging.info('Scenario B: overlapping cluster update + node-pool create') + cfg = _MakeNodePoolConfig(cluster, _SCENARIO_B_NAME) + results = _Results() + + def DoClusterUpdate(): + init, e2e, err = _TimedAsync(cluster.UpdateClusterAsync, + cluster.WaitForOperation) + results.add('ScenarioB_ClusterUpdate', init, e2e, err) + logging.info('Scenario B ClusterUpdate: init=%.2fs e2e=%.2fs ok=%s', + init, e2e, err is None) + + def DoCreate(): + init, e2e, err = _TimedAsync( + lambda: cluster.CreateNodePoolAsync(cfg, node_version=initial), + cluster.WaitForOperation, + ) + results.add('ScenarioB_NodePoolCreate', init, e2e, err) + logging.info('Scenario B NodePoolCreate: init=%.2fs e2e=%.2fs ok=%s', + init, e2e, err is None) + + background_tasks.RunThreaded(lambda fn: fn(), + [DoClusterUpdate, DoCreate]) + + samples: list[sample.Sample] = [] + for entry in results.entries: + samples += _OpSamples(entry.name, [entry], attempted_ops=1) + + # Remove test pool (best-effort). + try: + cluster.DeleteNodePool(_SCENARIO_B_NAME) + except Exception: # pylint: disable=broad-except + logging.exception('Scenario B: failed to delete test pool') + return samples + + +# --------------------------------------------------------------------------- +# Scenario C +# --------------------------------------------------------------------------- + + +def _RunScenarioC( + cluster: kubernetes_cluster.KubernetesCluster, + initial: str, + scale: int, +) -> list[sample.Sample]: + """Large-scale node-pool provisioning at a given scale. + + Streams all `scale` creates through a single executor capped at + _MAX_CONCURRENT workers — as each op completes the next starts immediately + (no batch barriers). Deletes target only live pools; EKS-rolled-back pools + still count as failures because the denominator is attempted_ops=scale. 
+ """ + logging.info( + 'Scenario C: scale=%d, max_concurrent=%d, initial_version=%s', + scale, + _MAX_CONCURRENT.value, + initial, + ) + pool_names = [_ScenarioCName(i) for i in range(scale)] + configs_ = [_MakeNodePoolConfig(cluster, name) for name in pool_names] + samples: list[sample.Sample] = [] + + # ── Creates ────────────────────────────────────────────────────────────── + create_results = _RunAsync( + kickoff=lambda cfg: cluster.CreateNodePoolAsync( + cfg, node_version=initial), + wait_fn=cluster.WaitForOperation, + items=configs_, + get_name=lambda cfg: cfg.name, + ) + created_ok = sum(1 for r in create_results if r.error is None) + logging.info('Scenario C scale=%d: %d/%d creates succeeded', scale, + created_ok, scale) + samples += _OpSamples('ScenarioC_Create', + create_results, + attempted_ops=scale) + + # ── Deletes (live-list) ────────────────────────────────────────────────── + alive = [ + p for p in cluster.GetNodePoolNames() if p.startswith(f'{_PREFIX}c') + ] + logging.info( + 'Scenario C scale=%d: %d live pools for delete (originally %d;' + + ' %d rolled back by cloud)', + scale, + len(alive), + scale, + scale - len(alive), + ) + if not alive: + logging.warning( + 'Scenario C scale=%d: 0 live pools — all timed-out creates were' + + ' rolled back. Recording 0%% delete success rate.', scale) + samples += _OpSamples('ScenarioC_Delete', [], attempted_ops=scale) + return samples + + delete_results = _RunAsync( + kickoff=cluster.DeleteNodePoolAsync, + wait_fn=cluster.WaitForOperation, + items=alive, + get_name=str, + ) + # attempted_ops=scale: accurate rate against original request count. 
+ samples += _OpSamples('ScenarioC_Delete', + delete_results, + attempted_ops=scale) + return samples + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +class _Results: + """Thread-safe collector for (name, init_latency, e2e_latency, error).""" + + def __init__(self): + self._lock = threading.Lock() + self.entries: list[_OpResult] = [] + + def add(self, name: str, init_dur: float, e2e_dur: float, + err: Exception | None) -> None: + result = _OpResult(name, init_dur, e2e_dur, err) + with self._lock: + self.entries.append(result) + + +def _TimedAsync( + kickoff: Callable[[], str], + wait_fn: Callable[[str], None], +) -> tuple[float, float, Exception | None]: + """Runs kickoff() then wait_fn(handle); returns (init_lat, e2e_lat, err). + + init_lat = time for kickoff() to return (API accepted). + e2e_lat = total wall time including wait. On kickoff failure both are set + to elapsed time at failure point. + """ + init_start = time.monotonic() + try: + handle = kickoff() + except Exception as exc: # pylint: disable=broad-except + elapsed = time.monotonic() - init_start + return elapsed, elapsed, exc + init_dur = time.monotonic() - init_start + try: + wait_fn(handle) + return init_dur, time.monotonic() - init_start, None + except Exception as exc: # pylint: disable=broad-except + return init_dur, time.monotonic() - init_start, exc + + +def _RunAsync( + kickoff: Callable, + wait_fn: Callable[[str], None], + items: list, + get_name: Callable[[object], str], +) -> list[_OpResult]: + """Fires kickoff(item) concurrently for all items; returns timed results. + + Uses background_tasks.RunThreaded with a concurrency cap for streaming + execution — completed ops free their slot immediately for the next one. 
+ """ + if not items: + return [] + results = _Results() + cap = min(len(items), _MAX_CONCURRENT.value) + + def DoWrap(item): + init_dur, e2e_dur, err = _TimedAsync(lambda: kickoff(item), wait_fn) + name = get_name(item) + results.add(name, init_dur, e2e_dur, err) + logging.info('%s ok=%s initiation=%.2fs end_to_end=%.2fs', name, + err is None, init_dur, e2e_dur) + + background_tasks.RunThreaded(DoWrap, items, max_concurrent_threads=cap) + return results.entries + + +def _MakeNodePoolConfig( + cluster: kubernetes_cluster.KubernetesCluster, + name: str, +) -> container_lib.BaseNodePoolConfig: + """Builds a node-pool config from the cluster's default pool.""" + cfg = copy.copy(cluster.default_nodepool) + cfg.name = name + cfg.num_nodes = _NODES_PER_NODEPOOL.value + cfg.min_nodes = _NODES_PER_NODEPOOL.value + cfg.max_nodes = _NODES_PER_NODEPOOL.value + return cfg + + +def _OpSamples( + metric_prefix: str, + results: list[_OpResult], + attempted_ops: int | None = None, +) -> list[sample.Sample]: + """Per-op + aggregate samples for initiation and end-to-end latency. + + Args: + metric_prefix: prefix for all metric names. + results: _OpResult entries (name, init_dur, e2e_dur, error), one per op. + attempted_ops: total ops originally requested. Used as the denominator + for SuccessRate so EKS-rolled-back pools (which never + appear in results) are counted as failures, not ignored. + If None, len(results) is used (original behavior). 
+ """ + samples: list[sample.Sample] = [] + init_latencies: list[float] = [] + e2e_latencies: list[float] = [] + success = 0 + + for r in results: + meta = {'operation_name': r.name, 'success': str(r.error is None)} + if r.error is not None: + meta['error'] = str(r.error)[:200] + else: + success += 1 + init_latencies.append(r.init_dur) + e2e_latencies.append(r.e2e_dur) + samples.append( + sample.Sample(f'{metric_prefix}_InitiationLatency', r.init_dur, + 'seconds', dict(meta))) + samples.append( + sample.Sample(f'{metric_prefix}_EndToEndLatency', r.e2e_dur, + 'seconds', dict(meta))) + + # ── Success rate ───────────────────────────────────────────────────────── + total = attempted_ops if attempted_ops is not None else len(results) + executed = len(results) + if total > 0: + samples.append( + sample.Sample( + f'{metric_prefix}_SuccessRate', + 100.0 * success / total, + 'percent', + { + 'total_ops': str(total), + 'executed_ops': str(executed), + 'successful_ops': str(success), + 'skipped_ops': str(total - executed), + }, + )) + + # ── Aggregate stats (successful ops only) ──────────────────────────────── + for phase_label, latencies in ( + ('InitiationLatency', init_latencies), + ('EndToEndLatency', e2e_latencies), + ): + if len(latencies) >= 2: + samples += _AggregateSamples(metric_prefix, phase_label, latencies) + if len(latencies) >= 4: + samples += _OutlierSamples(metric_prefix, phase_label, latencies) + + return samples + + +def _AggregateSamples(metric_prefix: str, phase_label: str, + latencies: list[float]) -> list[sample.Sample]: + """Emits Mean/StdDev/Min/Median/P90/P99/Max samples for a latency series.""" + n = len(latencies) + meta = {'sample_count': str(n)} + + # statistics.quantiles with method='inclusive' matches linear interpolation + # and returns n-1 cut points; index 89→P90, 98→P99. 
+ quantiles = statistics.quantiles(latencies, n=100, method='inclusive') + + stats = [ + ('Mean', statistics.mean(latencies)), + ('StdDev', statistics.pstdev(latencies)), + ('Min', min(latencies)), + ('Median', statistics.median(latencies)), + ('P90', quantiles[89]), + ('P99', quantiles[98]), + ('Max', max(latencies)), + ] + result = [] + for label, value in stats: + result.append( + sample.Sample( + f'{metric_prefix}_{phase_label}_{label}', + value, + 'seconds', + dict(meta), + )) + return result + + +def _OutlierSamples(metric_prefix: str, phase_label: str, + latencies: list[float]) -> list[sample.Sample]: + """Emits a single OutlierCount sample using IQR-fence outlier detection.""" + # statistics.quantiles(n=4) returns [Q1, Q2, Q3]; indices 0 and 2. + quartiles = statistics.quantiles(latencies, n=4, method='inclusive') + q1, q3 = quartiles[0], quartiles[2] + iqr = q3 - q1 + lower_fence = q1 - 1.5 * iqr + upper_fence = q3 + 1.5 * iqr + outlier_count = sum( + 1 for v in latencies if v < lower_fence or v > upper_fence + ) + meta = { + 'q1': str(q1), + 'q3': str(q3), + 'iqr': str(iqr), + 'upper_fence': str(upper_fence), + 'lower_fence': str(lower_fence), + 'sample_count': str(len(latencies)), + } + return [ + sample.Sample( + f'{metric_prefix}_{phase_label}_OutlierCount', + outlier_count, + 'count', + meta, + ) + ] diff --git a/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py b/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py index 15b9adb5a8..c02b820ee7 100644 --- a/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py +++ b/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py @@ -26,6 +26,7 @@ import logging import math import re +import threading from typing import Any from urllib import parse @@ -45,6 +46,13 @@ from perfkitbenchmarker.resources.container_service import kubernetes_cluster from perfkitbenchmarker.resources.container_service import kubernetes_commands +# Flag to skip EBS CSI driver setup during cluster 
creation. +# The kubernetes_management benchmark does not use persistent volumes, so +# EBS CSI setup (OIDC + IAM role + addon install) is unnecessary and adds +# ~3 minutes to every run. Set to True to skip it and save time. +# Defined before FLAGS = flags.FLAGS so it is registered at import time +# and visible to PKB's flag parser before --cloud/--container_cluster_type +# are resolved. FLAGS = flags.FLAGS # GPU types which practically require spot to get. _RARE_GPU_TYPES = [ @@ -54,7 +62,7 @@ ] -def RecursivelyUpdateDictionary( +def _recursively_update_dictionary( original: dict[str, Any], updates: dict[str, Any] ) -> dict[str, Any]: """Updates a nested dictionary. @@ -72,14 +80,14 @@ def RecursivelyUpdateDictionary( # Copied from https://stackoverflow.com/questions/3232943 for k, v in updates.items(): if isinstance(v, abc.Mapping): - original[k] = RecursivelyUpdateDictionary(original.get(k, {}), v) + original[k] = _recursively_update_dictionary(original.get(k, {}), v) else: original[k] = v return original class BaseEksCluster(kubernetes_cluster.KubernetesCluster): - """Shared base class for Elastic Kubernetes Service cluster auto mode & not.""" + """Shared base class for EKS cluster (auto mode and standard).""" def __init__(self, spec): # EKS requires a region and optionally a list of one or zones. 
@@ -107,6 +115,9 @@ def __init__(self, spec): self.account: str = util.GetAccount() self.node_to_nodepool: dict[str, container.BaseNodePoolConfig | None] = {} self.node_to_machine_type: dict[str, str | None] = {} + self._cached_subnets: list[str] | None = None + self._cached_subnets_per_az: dict[str, str] | None = None + self._cached_node_role_arn: str | None = None def _ChooseSecondZone(self): """Choose a second zone for the control plane if only one is specified.""" @@ -118,23 +129,30 @@ def _ChooseSecondZone(self): self.region + ('b' if self.zone.endswith('a') else 'a') ) - def _CreateDependencies(self): - """Set up the ssh key.""" - aws_virtual_machine.AwsKeyFileManager.ImportKeyfile(self.region) - def _DeleteDependencies(self): - """Delete the ssh key.""" - aws_virtual_machine.AwsKeyFileManager.DeleteKeyfile(self.region) + + def _EksCtlCreate(self, create_json: dict[str, Any]): """Creates the EKS cluster.""" - # If multiple zones are passed use them for the control plane. - # Otherwise EKS will auto-select control plane zones in the region. - if self.control_plane_zones: - create_json['availabilityZones'] = self.control_plane_zones + # Pass all control_plane_zones to the cluster so eksctl creates VPC subnets + # in every requested AZ. Without this, eksctl may only create subnets in 2 + # AZs even when 3 are requested, preventing round-robin nodegroup placement. + # This is critical for distributing nodegroups across AZs to avoid per-AZ + # EC2 capacity limits. + # availabilityZones is already set in create_json by _CreateDependencies + # via the EC2 AZ query (bypassing PKB zone flag truncation). + # Log it here for visibility. 
+ if 'availabilityZones' in create_json: + logging.info( + '[EKS] Creating cluster with AZs: %s — ' + + 'eksctl will auto-assign CIDRs for all %d zones.', + create_json['availabilityZones'], + len(create_json['availabilityZones']), + ) # Schema for the cluster create command is here: # https://schema.eksctl.io/ - create_json = RecursivelyUpdateDictionary( + create_json = _recursively_update_dictionary( { 'apiVersion': 'eksctl.io/v1alpha5', 'kind': 'ClusterConfig', @@ -185,6 +203,11 @@ def _RenderNodeGroupJson( if nodepool.min_nodes != nodepool.max_nodes: group_json['minSize'] = nodepool.min_nodes group_json['maxSize'] = nodepool.max_nodes + # Pin the default nodegroup to control_plane_zones[0] so it stays in a + # single known AZ. The benchmark nodegroups (pkbma*, pkbmc*) are placed + # via CreateNodePoolAsync using the round-robin _DiscoverSubnetsPerAZ logic. + if self.control_plane_zones: + group_json['availabilityZones'] = [self.control_plane_zones[0]] return group_json def _WriteJsonToFile(self, json_dict: dict[str, Any]) -> str: @@ -211,6 +234,34 @@ def _WriteJsonToFile(self, json_dict: dict[str, Any]) -> str: def _Delete(self): """Deletes the control plane and worker nodes.""" + # Clean up SSH key pair — safety net in case _DeleteDependencies didn't run + try: + aws_virtual_machine.AwsKeyFileManager.DeleteKeyfile(self.region) + except Exception: # pylint: disable=broad-except + pass + # Clean up dynamically created launch templates and capacity reservations + # Only runs if capacity reservations were actually created this run. 
+ if getattr(FLAGS, 'eks_reserve_capacity_per_az', False): + for az in getattr(self, '_capacity_reservation_ids', {}).keys(): + vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'delete-launch-template', + '--launch-template-name', f'pkb-eks-lt-{az}', + '--region', self.region, + ], + raise_on_failure=False, + ) + logging.info('[EKS] Deleted launch template pkb-eks-lt-%s', az) + for az, res_id in getattr(self, '_capacity_reservation_ids', {}).items(): + vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'cancel-capacity-reservation', + '--capacity-reservation-id', res_id, + '--region', self.region, + ], + raise_on_failure=False, + ) + logging.info('[EKS] Cancelled capacity reservation %s in %s', res_id, az) super()._Delete() cmd = [ FLAGS.eksctl, @@ -392,60 +443,265 @@ def GetResourceMetadata(self): def _Create(self): """Creates the control plane and worker nodes.""" + # Import SSH key pair to EC2 before cluster creation — eksctl requires it. + aws_virtual_machine.AwsKeyFileManager.ImportKeyfile(self.region) nodepool_jsons = [self._RenderNodeGroupJson(self.default_nodepool)] for _, node_group in self.nodepools.items(): nodepool_jsons += [self._RenderNodeGroupJson(node_group)] create_json: dict[str, Any] = { 'managedNodeGroups': nodepool_jsons, - 'vpc': { - 'nat': {'gateway': 'Disable'}, - }, + 'vpc': {'nat': {'gateway': 'Disable'}}, } + # Explicitly set cluster-level availabilityZones so eksctl creates VPC + # public+private subnets in ALL AZs in the region. + # IMPORTANT: PKB's deprecated --zones flag gets truncated by its own + # translation layer to 2 AZs even when 3 are specified. We bypass this + # by querying EC2 directly for all available AZs in the region and + # passing all of them to eksctl. This ensures the VPC gets subnets in + # all AZs, enabling proper round-robin nodegroup placement. 
+ try: + az_out, _, az_rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'describe-availability-zones', + '--region', self.region, + '--filters', 'Name=state,Values=available', + '--query', 'AvailabilityZones[*].ZoneName', + '--output', 'json', + ], + raise_on_failure=False, + ) + if az_rc == 0 and az_out.strip(): + all_azs = json.loads(az_out.strip()) + # Limit to 3 AZs maximum to avoid excessive subnet creation + cluster_azs = sorted(all_azs)[:3] + else: + # Fallback: use control_plane_zones or default to known us-east-1 AZs + cluster_azs = ( + self.control_plane_zones + if self.control_plane_zones + else [f'{self.region}a', f'{self.region}b', f'{self.region}c'] + ) + except Exception: # pylint: disable=broad-except + cluster_azs = ( + self.control_plane_zones + if self.control_plane_zones + else [f'{self.region}a', f'{self.region}b', f'{self.region}c'] + ) + + create_json['availabilityZones'] = cluster_azs + logging.info( + '[EKS] Cluster will have subnets in %d AZs: %s ' + + '(queried from EC2, bypassing PKB zone flag truncation)', + len(cluster_azs), cluster_azs, + ) self._EksCtlCreate(create_json) + # Dynamically create capacity reservations + launch templates AFTER cluster + # creation so cluster CA and endpoint are available for node bootstrap. + # Gate capacity reservations behind flag — disabled by default + # to avoid impacting other EKS benchmarks (kubernetes_nginx etc) + # that use different instance types and do not need reservations. 
+ if not FLAGS.eks_reserve_capacity_per_az: + self._capacity_reservation_ids = {} + logging.info( + '[EKS] Skipping capacity reservations ' + '(--eks_reserve_capacity_per_az=False)' + ) + else: + self._capacity_reservation_ids = {} + # Reserve enough capacity per AZ for 100 pools: + # ~67 pools per AZ × 2 nodes = 134 instances max per AZ (Scenario A) + # Plus default nodegroup (2) + buffer = 80 minimum for 10 pools, 150 for 100 pools + concurrent = getattr(FLAGS, 'k8s_mgmt_concurrent_nodepools', 10) + nodes_per_az = max(80, concurrent * 2 + 20) + # Fetch cluster CA and endpoint for bootstrap user data + import json as _json + cluster_out, _, cluster_rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'eks', 'describe-cluster', + '--name', self.name, + '--region', self.region, + '--query', 'cluster.{endpoint:endpoint,ca:certificateAuthority.data,cidr:kubernetesNetworkConfig.serviceIpv4Cidr}', + '--output', 'json', + ], + raise_on_failure=False, + ) + cluster_ca = '' + cluster_endpoint = '' + cluster_service_cidr = '10.100.0.0/16' # default fallback + if cluster_rc == 0 and cluster_out.strip(): + cluster_info = _json.loads(cluster_out.strip()) + cluster_ca = cluster_info.get('ca', '') + cluster_endpoint = cluster_info.get('endpoint', '') + cluster_service_cidr = cluster_info.get('cidr', '10.100.0.0/16') + logging.info('[EKS] Fetched cluster endpoint=%s cidr=%s for bootstrap', + cluster_endpoint, cluster_service_cidr) + + # Query EKS-optimized AMI once for all AZs + # cluster_version may be None if not explicitly set — fetch from cluster + if not self.cluster_version: + ver_out, _, ver_rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'eks', 'describe-cluster', + '--name', self.name, + '--region', self.region, + '--query', 'cluster.version', + '--output', 'text', + ], + raise_on_failure=False, + ) + if ver_rc != 0 or not ver_out.strip(): + raise errors.Resource.CreationError( + '[EKS] Failed to determine cluster version from describe-cluster. 
' + 'Cannot proceed without a valid Kubernetes version. ' + f'rc={ver_rc} out={ver_out.strip()!r}' + ) + self.cluster_version = ver_out.strip() + logging.info('[EKS] Resolved cluster version: %s', self.cluster_version) + k8s_minor_str = '.'.join(self.cluster_version.split('.')[:2]) + ami_out, _, ami_rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ssm', 'get-parameter', + '--name', ( + f'/aws/service/eks/optimized-ami/{k8s_minor_str}/' + 'amazon-linux-2023/x86_64/standard/recommended/image_id' + ), + '--region', self.region, + '--query', 'Parameter.Value', + '--output', 'text', + ], + raise_on_failure=False, + ) + ami_id = ami_out.strip() if ami_rc == 0 and ami_out.strip() else '' + logging.info('[EKS] EKS AMI for K8s %s: %s', k8s_minor_str, ami_id) + + for az in cluster_azs: + logging.info('[EKS] Creating capacity reservation in %s (%d instances)...', az, nodes_per_az) + cap_out, _, cap_rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'create-capacity-reservation', + '--instance-type', 't3.medium', + '--instance-platform', 'Linux/UNIX', + '--availability-zone', az, + '--instance-count', str(nodes_per_az), + '--region', self.region, + '--query', 'CapacityReservation.CapacityReservationId', + '--output', 'text', + ], + raise_on_failure=False, + ) + if cap_rc == 0 and cap_out.strip() and cap_out.strip() != 'None': + res_id = cap_out.strip() + self._capacity_reservation_ids[az] = res_id + logging.info('[EKS] Created capacity reservation %s in %s', res_id, az) + if ami_id and cluster_ca and cluster_endpoint: + import base64 as _b64 + # AL2023 uses nodeadm YAML config — NOT the old bootstrap.sh + nodeadm_config = ( + 'apiVersion: node.eks.aws/v1alpha1' + chr(10) + + 'kind: NodeConfig' + chr(10) + + 'spec:' + chr(10) + + ' cluster:' + chr(10) + + f' name: {self.name}' + chr(10) + + f' apiServerEndpoint: {cluster_endpoint}' + chr(10) + + f' certificateAuthority: {cluster_ca}' + chr(10) + + f' cidr: {cluster_service_cidr}' + ) + user_data = 
_b64.b64encode(('MIME-Version: 1.0' + chr(10) + + 'Content-Type: multipart/mixed; boundary="==BOUNDARY=="' + chr(10) + + chr(10) + + '--==BOUNDARY==' + chr(10) + + 'Content-Type: application/node.eks.aws' + chr(10) + + chr(10) + + nodeadm_config + chr(10) + + '--==BOUNDARY==--').encode()).decode() + logging.info('[EKS] Using AL2023 nodeadm bootstrap for %s', az) + lt_data = ( + '{' + f'"ImageId":"{ami_id}",' + '"CapacityReservationSpecification":{' + '"CapacityReservationPreference":"capacity-reservations-only",' + f'"CapacityReservationTarget":{{"CapacityReservationId":"{res_id}"}}}},' + f'"UserData":"{user_data}"' + '}' + ) + _, _, lt_rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'create-launch-template', + '--region', self.region, + '--launch-template-name', f'pkb-eks-lt-{az}', + '--launch-template-data', lt_data, + ], + raise_on_failure=False, + ) + if lt_rc == 0: + logging.info( + '[EKS] Created launch template pkb-eks-lt-%s (AMI=%s) -> %s', + az, ami_id, res_id, + ) + else: + logging.warning('[EKS] Failed to create launch template for %s', az) + else: + logging.warning('[EKS] Missing AMI/CA/endpoint — no launch template for %s', az) + else: + logging.warning('[EKS] Failed to create capacity reservation in %s — on-demand', az) + # Above create command passes "withOidc=true", but it doesn't seem to work & # therefore this command is needed. 
- cmd = [ - FLAGS.eksctl, - 'utils', - 'associate-iam-oidc-provider', - f'--cluster={self.name}', - f'--region={self.region}', - '--approve', - ] - vm_util.IssueCommand(cmd) + if not FLAGS.eks_skip_ebs_csi: + cmd = [ + FLAGS.eksctl, + 'utils', + 'associate-iam-oidc-provider', + f'--cluster={self.name}', + f'--region={self.region}', + '--approve', + ] + vm_util.IssueCommand(cmd) # EBS CSI driver is required for creating EBS volumes in version > 1.23 # https://docs.aws.amazon.com/eks/latest/userguide/ebs-csi.html + # Skip if --eks_skip_ebs_csi is set (saves ~3 min for benchmarks that + # do not use persistent volumes, such as kubernetes_management). + if FLAGS.eks_skip_ebs_csi: + logging.info( + '[EKS] Skipping EBS CSI driver setup (--eks_skip_ebs_csi=True). ' + + 'Saves ~3 min. Set to False if benchmark needs persistent volumes.' + ) + else: + # Name must be unique. + ebs_csi_driver_role = f'AmazonEKS_EBS_CSI_DriverRole_{self.name}' + + ebs_policy_arn = ( + 'arn:aws:iam::aws:policy/service-role/' + + 'AmazonEBSCSIDriverPolicy') + cmd = [ + FLAGS.eksctl, + 'create', + 'iamserviceaccount', + '--name=ebs-csi-controller-sa', + '--namespace=kube-system', + f'--region={self.region}', + f'--cluster={self.name}', + f'--attach-policy-arn={ebs_policy_arn}', + '--approve', + '--role-only', + f'--role-name={ebs_csi_driver_role}', + ] + vm_util.IssueCommand(cmd) - # Name must be unique. 
- ebs_csi_driver_role = f'AmazonEKS_EBS_CSI_DriverRole_{self.name}' - - cmd = [ - FLAGS.eksctl, - 'create', - 'iamserviceaccount', - '--name=ebs-csi-controller-sa', - '--namespace=kube-system', - f'--region={self.region}', - f'--cluster={self.name}', - '--attach-policy-arn=arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy', - '--approve', - '--role-only', - f'--role-name={ebs_csi_driver_role}', - ] - vm_util.IssueCommand(cmd) - - cmd = [ - FLAGS.eksctl, - 'create', - 'addon', - '--name=aws-ebs-csi-driver', - f'--region={self.region}', - f'--cluster={self.name}', - f'--service-account-role-arn=arn:aws:iam::{self.account}:role/{ebs_csi_driver_role}', - ] - vm_util.IssueCommand(cmd) + svc_acct_arn = ( + f'arn:aws:iam::{self.account}:role/{ebs_csi_driver_role}') + cmd = [ + FLAGS.eksctl, + 'create', + 'addon', + '--name=aws-ebs-csi-driver', + f'--region={self.region}', + f'--cluster={self.name}', + f'--service-account-role-arn={svc_acct_arn}', + ] + vm_util.IssueCommand(cmd) if aws_flags.AWS_EKS_POD_IDENTITY_ROLE.value: cmd = util.AWS_PREFIX + [ @@ -526,6 +782,703 @@ def ResizeNodePool( ] vm_util.IssueCommand(cmd) + def CreateNodePool( + self, + nodepool_config: container.BaseNodePoolConfig, + node_version: str | None = None, + ) -> None: + """Creates a single managed node group on the cluster.""" + ng_json = self._RenderNodeGroupJson(nodepool_config) + if node_version: + ng_json['version'] = node_version + config_json = { + 'apiVersion': 'eksctl.io/v1alpha5', + 'kind': 'ClusterConfig', + 'metadata': { + 'name': self.name, + 'region': self.region, + }, + 'managedNodeGroups': [ng_json], + } + filename = self._WriteJsonToFile(config_json) + cmd = [ + FLAGS.eksctl, + 'create', + 'nodegroup', + f'--config-file={filename}', + ] + _, stderr, retcode = vm_util.IssueCommand( + cmd, timeout=1800, raise_on_failure=False + ) + if retcode: + raise errors.Resource.CreationError(stderr) + + def DeleteNodePool(self, name: str) -> None: + """Deletes the named node 
group.""" + cmd = [ + FLAGS.eksctl, + 'delete', + 'nodegroup', + f'--name={name}', + f'--cluster={self.name}', + f'--region={self.region}', + '--wait', + ] + vm_util.IssueCommand(cmd, timeout=1800) + + def UpgradeNodePool(self, name: str, target_version: str) -> None: + """Upgrades the named node group to target_version.""" + cmd = [ + FLAGS.eksctl, + 'upgrade', + 'nodegroup', + f'--name={name}', + f'--cluster={self.name}', + f'--region={self.region}', + f'--kubernetes-version={target_version}', + '--wait', + ] + vm_util.IssueCommand(cmd, timeout=1800) + + # ---- Async variants (return opaque handles) ------------------------------- + + def _DiscoverSubnets(self) -> list[str]: + """Returns the EKS cluster's subnet IDs (cached after first call).""" + if getattr(self, '_cached_subnets', None): + return self._cached_subnets + out, _, _ = vm_util.IssueCommand( + util.AWS_PREFIX + + [ + 'eks', + 'describe-cluster', + '--name', + self.name, + '--region', + self.region, + ] + ) + info = json.loads(out) + self._cached_subnets = info['cluster']['resourcesVpcConfig']['subnetIds'] + return self._cached_subnets + + def _DiscoverSubnetsPerAZ(self) -> dict[str, str]: + """Returns a mapping of {AZ: subnet_id} for the cluster's subnets. + + Used by CreateNodePoolAsync to distribute nodegroups round-robin across + AZs, avoiding per-AZ EC2 capacity limits when creating many pools. + Only returns AZs that are in control_plane_zones (if specified). + Cached after first call. 
+ """ + if getattr(self, '_cached_subnets_per_az', None) is not None: + return self._cached_subnets_per_az + + subnet_ids = self._DiscoverSubnets() + if not subnet_ids: + self._cached_subnets_per_az = {} + return {} + + # Describe subnets to get their AZ mapping + out, _, rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'describe-subnets', + '--region', self.region, + '--subnet-ids', *subnet_ids, + '--query', 'Subnets[*].{SubnetId:SubnetId,AZ:AvailabilityZone,Public:MapPublicIpOnLaunch}', + '--output', 'json', + ], + raise_on_failure=False, + ) + if rc: + logging.warning( + '[EKS] Could not describe subnets for AZ mapping — ' + + 'falling back to all subnets' + ) + self._cached_subnets_per_az = {} + return {} + + subnets = json.loads(out) + + # Do NOT filter by control_plane_zones — PKB truncates it to 2 AZs. + # Accept all subnets the VPC has across all AZs. + # Build AZ map — always prefer public subnets (MapPublicIpOnLaunch=True) + # which have an internet gateway route. Private subnets lack IGW routes + # and nodes launched there cannot reach the EKS API server to join. + az_map: dict[str, str] = {} + az_map_private: dict[str, str] = {} + for s in subnets: + az = s['AZ'] + if s.get('Public'): + az_map[az] = s['SubnetId'] + logging.info('[EKS] AZ %s → public subnet %s', az, s['SubnetId']) + elif az not in az_map: + az_map_private[az] = s['SubnetId'] + for az, sid in az_map_private.items(): + if az not in az_map: + logging.warning('[EKS] AZ %s has no public subnet — using private %s', az, sid) + az_map[az] = sid + + logging.info( + '[EKS] Subnet-per-AZ mapping: %s (from %d total subnets)', + az_map, len(subnet_ids), + ) + self._cached_subnets_per_az = az_map + return az_map + + def _ResolveReleaseVersion(self, minor: str) -> str: + """Returns the EKS-optimized AMI release version (e.g. '1.33.10-20260124'). + + Used to populate `releaseVersion` in the create-nodegroup payload so the + benchmark can pin specific K8s minors. 
Thread-safe: at scale we have N + workers all asking for the same minor; only the first does the SSM + lookup, the rest read from the cache. + """ + if getattr(self, '_release_version_lock', None) is None: + self._release_version_lock = threading.Lock() + with self._release_version_lock: + cache = getattr(self, '_cached_release_versions', None) or {} + if minor in cache: + return cache[minor] + cmd = util.AWS_PREFIX + [ + 'ssm', + 'get-parameter', + '--name', + ( + f'/aws/service/eks/optimized-ami/{minor}/amazon-linux-2023/' + 'x86_64/standard/recommended/release_version' + ), + '--region', + self.region, + '--query', + 'Parameter.Value', + '--output', + 'text', + ] + out, err, rc = vm_util.IssueCommand(cmd, raise_on_failure=False) + if rc: + raise errors.Resource.CreationError( + f'Failed to resolve EKS release version for minor {minor!r}: {err}' + ) + cache[minor] = out.strip() + self._cached_release_versions = cache + return cache[minor] + + def _DiscoverNodeRoleArn(self) -> str: + """Returns a node IAM role ARN by inspecting an existing nodegroup.""" + if getattr(self, '_cached_node_role_arn', None): + return self._cached_node_role_arn + out, _, _ = vm_util.IssueCommand( + util.AWS_PREFIX + + [ + 'eks', + 'list-nodegroups', + '--cluster-name', + self.name, + '--region', + self.region, + ] + ) + for ng_name in json.loads(out).get('nodegroups', []): + ng_out, _, _ = vm_util.IssueCommand( + util.AWS_PREFIX + + [ + 'eks', + 'describe-nodegroup', + '--cluster-name', + self.name, + '--nodegroup-name', + ng_name, + '--region', + self.region, + ] + ) + role = json.loads(ng_out)['nodegroup'].get('nodeRole') + if role: + self._cached_node_role_arn = role + return role + raise errors.Resource.CreationError( + f'No existing nodegroup found to discover node role for ' + f'cluster {self.name}.' 
+ ) + + def CreateNodePoolAsync( + self, + nodepool_config: container.BaseNodePoolConfig, + node_version: str | None = None, + ) -> str: + # Pass the full request via --cli-input-json so that we can specify both + # `version` (e.g. "1.33") and `releaseVersion` (e.g. "1.33.11-...") in + # the same call. Two reasons this matters: + # 1. AWS CLI v1 has a bug where the top-level --version flag swallows + # the subcommand --version, printing the CLI banner and exiting. + # cli-input-json sidesteps CLI argument parsing entirely. + # 2. EKS rejects a releaseVersion that doesn't match the request's + # `version`; if `version` is omitted EKS defaults it to the + # cluster's version, which (for the N-1 -> N benchmark path) + # produces a "release version X is not valid for kubernetes + # version Y" error. + + # ── AZ distribution ──────────────────────────────────────────────────── + # When multiple zones are specified (e.g. us-east-1a,1b,1c), distribute + # nodegroups round-robin across AZs to avoid per-AZ EC2 capacity limits. + # Without this, EKS places all nodegroups in a single AZ causing timeouts. + # Pool name format: pkbma000, pkbma001, ... — extract index from suffix. + az_subnets = self._DiscoverSubnetsPerAZ() + if az_subnets and len(az_subnets) > 1: + # Extract numeric suffix from pool name to determine AZ assignment + name = nodepool_config.name + suffix = ''.join(c for c in name if c.isdigit()) + # pkbmb (Scenario B) has no suffix — assign to us-east-1b (idx=1) + # to avoid competing with us-east-1a which has the default nodegroup. 
+ idx = int(suffix) if suffix else 1 + zones = sorted(az_subnets.keys()) + assigned_az = zones[idx % len(zones)] + subnets = [az_subnets[assigned_az]] + logging.info( + '[EKS] CreateNodePool %s -> AZ=%s subnet=%s (round-robin idx=%d)', + name, assigned_az, subnets[0], idx, + ) + else: + subnets = self._DiscoverSubnets() + logging.info('[EKS] CreateNodePool %s -> using all subnets (single AZ)', + nodepool_config.name) + + payload: dict[str, Any] = { + 'clusterName': self.name, + 'nodegroupName': nodepool_config.name, + 'scalingConfig': { + 'minSize': nodepool_config.num_nodes, + 'maxSize': nodepool_config.num_nodes, + 'desiredSize': nodepool_config.num_nodes, + }, + 'subnets': subnets, + 'instanceTypes': [nodepool_config.machine_type], + 'amiType': 'AL2023_x86_64_STANDARD', + 'nodeRole': self._DiscoverNodeRoleArn(), + 'labels': {'pkb_nodepool': nodepool_config.name}, + 'tags': util.MakeDefaultTags(), + # Target open capacity reservations first before falling back to + # regular on-demand. Ensures EC2 capacity reservations created + # before the benchmark are actually used by EKS nodegroups. + 'capacityReservationSpecification': { + 'capacityReservationPreference': 'open', + }, + } + _az = assigned_az if az_subnets and len(az_subnets) > 1 else f'{self.region}a' + # Only look up launch templates and capacity reservations when + # --eks_reserve_capacity_per_az=true. Other benchmarks skip this entirely. 
+ if FLAGS.eks_reserve_capacity_per_az: + _lt_name = f'pkb-eks-lt-{_az}' + _lt_out, _, _lt_rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'describe-launch-templates', + '--region', self.region, + '--filters', f'Name=launch-template-name,Values={_lt_name}', + '--query', 'LaunchTemplates[0].LaunchTemplateId', + '--output', 'text', + ], + raise_on_failure=False, + ) + res_id = self._capacity_reservation_ids.get(_az, '') + if res_id and _lt_rc == 0 and _lt_out.strip() and _lt_out.strip() not in ('None', 'null', ''): + payload['launchTemplate'] = {'id': _lt_out.strip(), 'version': '$Latest'} + # When launch template specifies an ImageId, EKS rejects these fields: + # - releaseVersion: conflicts with AMI + # - instanceTypes: must come from launch template only + # - amiType: conflicts with AMI + payload.pop('releaseVersion', None) + payload.pop('instanceTypes', None) + payload.pop('amiType', None) + logging.info( + '[EKS] Nodegroup %s using launch template %s targeting reservation %s in AZ %s', + nodepool_config.name, _lt_name, res_id, _az, + ) + else: + logging.warning('[EKS] No reservation/template for AZ %s — using on-demand', _az) + + if node_version: + # EKS rejects both 'version' and 'releaseVersion' when a launch template + # with ImageId is specified — skip both when launchTemplate is in use. + if 'launchTemplate' not in payload: + payload['version'] = node_version + payload['releaseVersion'] = self._ResolveReleaseVersion(node_version) + filename = self._WriteJsonToFile(payload) + cmd = util.AWS_PREFIX + [ + 'eks', + 'create-nodegroup', + '--region', + self.region, + '--cli-input-json', + f'file://{filename}', + ] + # Retry on EC2 RunInstances throttling at high concurrency (99 pools). 
+ max_retries = 5 + base_delay = 10 + for attempt in range(max_retries): + _, stderr, retcode = vm_util.IssueCommand( + cmd, timeout=300, raise_on_failure=False + ) + if retcode == 0: + break + if 'Request limit exceeded' in stderr or 'ThrottlingException' in stderr: + if attempt < max_retries - 1: + delay = base_delay * (2 ** attempt) + logging.warning( + '[EKS] CreateNodegroup %s throttled — retry %d/%d in %ds', + nodepool_config.name, attempt + 1, max_retries, delay, + ) + time.sleep(delay) + continue + raise errors.Resource.CreationError(stderr) + else: + raise errors.Resource.CreationError( + f'CreateNodegroup {nodepool_config.name} failed after retries: {stderr}' + ) + return f'ng_active:{nodepool_config.name}' + + def UpgradeNodePoolAsync(self, name: str, target_version: str) -> str: + # For Custom AMI nodegroups (using launch template with ImageId), + # EKS requires the launch template to be passed on upgrade. + # Determine the AZ for this nodegroup to find the correct launch template. + suffix = ''.join(c for c in name if c.isdigit()) + # pkbmb (Scenario B) has no suffix — use idx=1 (us-east-1b) to avoid + # competing with us-east-1a which already has the default nodegroup + idx = int(suffix) if suffix else 1 + az_subnets = self._DiscoverSubnetsPerAZ() + if az_subnets and len(az_subnets) > 1: + zones = sorted(az_subnets.keys()) + _az = zones[idx % len(zones)] + else: + _az = f'{self.region}a' + # Only look up launch template when capacity reservations are enabled. + # For other benchmarks, always use standard kubernetes-version upgrade. 
+ lt_id = '' + _lt_name = '' + if FLAGS.eks_reserve_capacity_per_az: + _lt_name = f'pkb-eks-lt-{_az}' + lt_out, _, lt_rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'describe-launch-templates', + '--region', self.region, + '--filters', f'Name=launch-template-name,Values={_lt_name}', + '--query', 'LaunchTemplates[0].LaunchTemplateId', + '--output', 'text', + ], + raise_on_failure=False, + ) + lt_id = lt_out.strip() if lt_rc == 0 and lt_out.strip() not in ('', 'None', 'null') else '' + + # Custom AMI nodegroups cannot use --kubernetes-version — use launch template only + if lt_id: + cmd = util.AWS_PREFIX + [ + 'eks', 'update-nodegroup-version', + '--cluster-name', self.name, + '--nodegroup-name', name, + '--region', self.region, + '--launch-template', f'id={lt_id},version=$Latest', + ] + logging.info('[EKS] Upgrading %s with launch template %s in AZ %s', + name, _lt_name, _az) + else: + cmd = util.AWS_PREFIX + [ + 'eks', 'update-nodegroup-version', + '--cluster-name', self.name, + '--nodegroup-name', name, + '--region', self.region, + '--kubernetes-version', target_version, + ] + _, stderr, retcode = vm_util.IssueCommand( + cmd, timeout=300, raise_on_failure=False + ) + if retcode: + raise errors.Resource.CreationError(stderr) + return f'ng_active:{name}' + + def DeleteNodePoolAsync(self, name: str) -> str: + cmd = util.AWS_PREFIX + [ + 'eks', + 'delete-nodegroup', + '--cluster-name', + self.name, + '--nodegroup-name', + name, + '--region', + self.region, + ] + _, stderr, retcode = vm_util.IssueCommand( + cmd, timeout=300, raise_on_failure=False + ) + if retcode: + raise errors.Resource.CreationError(stderr) + return f'ng_gone:{name}' + + def UpdateClusterAsync(self) -> str: + """Fires a CloudWatch logging toggle; returns handle 'cluster_update:'. 
+ + Returns a handle carrying the specific update id so WaitForOperation + can poll *that* update's status (Successful / Failed) rather than the + cluster's top-level status (which stays ACTIVE during config updates, + making the wait return instantly and silently mis-reporting latency). + """ + log_types = ['api', 'audit', 'authenticator', 'controllerManager', + 'scheduler'] + describe = util.AWS_PREFIX + [ + 'eks', + 'describe-cluster', + '--name', + self.name, + '--region', + self.region, + ] + out, _, _ = vm_util.IssueCommand(describe) + current = ( + json.loads(out)['cluster'].get('logging', {}).get('clusterLogging', []) + ) + any_enabled = any(e.get('enabled', False) for e in current) + payload = json.dumps({ + 'clusterLogging': [ + {'types': log_types, 'enabled': not any_enabled} + ] + }) + upd = util.AWS_PREFIX + [ + 'eks', + 'update-cluster-config', + '--name', + self.name, + '--region', + self.region, + '--logging', + payload, + ] + # Wait for cluster ACTIVE before firing update — at 99-pool scale + # Scenario A leaves the cluster UPDATING causing ResourceInUseException. 
+ logging.info('[EKS] Waiting for cluster ACTIVE before ClusterUpdate...') + for _ in range(60): + status_out, _, status_rc = vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'eks', 'describe-cluster', + '--name', self.name, + '--region', self.region, + '--query', 'cluster.status', + '--output', 'text', + ], + raise_on_failure=False, + ) + if status_rc == 0 and status_out.strip() == 'ACTIVE': + logging.info('[EKS] Cluster is ACTIVE — proceeding with ClusterUpdate') + break + logging.info('[EKS] Cluster status=%s — waiting 5s...', status_out.strip()) + time.sleep(5) + # Retry on ResourceInUseException race condition + upd_max_retries = 10 + upd_base_delay = 30 + for upd_attempt in range(upd_max_retries): + stdout, stderr, retcode = vm_util.IssueCommand( + upd, timeout=300, raise_on_failure=False + ) + if retcode == 0: + break + if 'ResourceInUseException' in stderr and upd_attempt < upd_max_retries - 1: + delay = upd_base_delay * (upd_attempt + 1) + logging.warning( + '[EKS] UpdateClusterConfig ResourceInUseException — retry %d/%d in %ds', + upd_attempt + 1, upd_max_retries, delay, + ) + time.sleep(delay) + continue + raise errors.Resource.CreationError(stderr) + update_id = json.loads(stdout)['update']['id'] + return f'cluster_update:{update_id}' + + def ResolveNodePoolVersions(self) -> tuple[str, str]: + """Returns (initial, target) EKS nodegroup versions. + + Uses cluster_version (already set from FLAGS/describe-cluster) rather than + querying kubectl, which is faster and avoids a kubectl round-trip. + initial = N-1 (adjacent minor below cluster version) + target = N (cluster version = latest) + """ + cluster_ver = self.cluster_version or self.k8s_version + # Strip any patch suffix e.g. 
'1.34.7' -> '1.34' + parts = cluster_ver.lstrip('v').split('.') + major, minor = int(parts[0]), int(parts[1]) + target = f'{major}.{minor}' + initial = f'{major}.{minor - 1}' + logging.info( + '[EKS] ResolveNodePoolVersions: cluster=%s initial=%s target=%s', + cluster_ver, initial, target, + ) + return initial, target + + def WaitForOperation(self, op_handle: str) -> None: + """Polls EKS resources until the expected terminal state is observed.""" + kind, _, name = op_handle.partition(':') + + @vm_util.Retry( + poll_interval=5, + fuzz=0, + timeout=3600, + retryable_exceptions=(errors.Resource.RetryableCreationError,), + ) + def _wait_ng_active(): + out, err, rc = vm_util.IssueCommand( + util.AWS_PREFIX + + [ + 'eks', + 'describe-nodegroup', + '--cluster-name', + self.name, + '--nodegroup-name', + name, + '--region', + self.region, + ], + raise_on_failure=False, + ) + if rc: + raise errors.Resource.RetryableCreationError(err) + status = json.loads(out)['nodegroup']['status'] + if status in ('ACTIVE',): + return + if status in ('CREATE_FAILED', 'DELETE_FAILED', 'DEGRADED'): + raise errors.Resource.CreationError( + f'nodegroup {name} ended in {status}' + ) + raise errors.Resource.RetryableCreationError( + f'nodegroup {name} status={status}' + ) + + @vm_util.Retry( + poll_interval=5, + fuzz=0, + timeout=3600, + retryable_exceptions=(errors.Resource.RetryableDeletionError,), + ) + def _wait_ng_gone(): + _, err, rc = vm_util.IssueCommand( + util.AWS_PREFIX + + [ + 'eks', + 'describe-nodegroup', + '--cluster-name', + self.name, + '--nodegroup-name', + name, + '--region', + self.region, + ], + raise_on_failure=False, + ) + if rc and 'ResourceNotFoundException' in (err or ''): + return + if rc: + raise errors.Resource.RetryableDeletionError(err) + raise errors.Resource.RetryableDeletionError( + f'nodegroup {name} still present' + ) + + @vm_util.Retry( + poll_interval=5, + fuzz=0, + timeout=3600, + retryable_exceptions=(errors.Resource.RetryableCreationError,), + ) + def 
def UpdateCluster(self) -> None:
  """Performs a real cluster-level update by toggling CloudWatch logging.

  Describes the cluster to read its current `clusterLogging` entries. If any
  entry is enabled, the five control-plane log types are disabled; otherwise
  they are all enabled. Afterwards `cluster.status` is polled until it reads
  ACTIVE.

  Per the original author's note, enabling all five log types is a 5-10
  minute control-plane operation, which gives a meaningful overlap window
  for Scenario B.
  """
  describe_cmd = util.AWS_PREFIX + [
      'eks', 'describe-cluster',
      '--name', self.name,
      '--region', self.region,
  ]
  describe_out, _, _ = vm_util.IssueCommand(describe_cmd)
  logging_entries = (
      json.loads(describe_out)['cluster']
      .get('logging', {})
      .get('clusterLogging', [])
  )
  # Turn logging on only when nothing is enabled at the moment.
  enable_logs = True
  for entry in logging_entries:
    if entry.get('enabled', False):
      enable_logs = False
      break
  payload = json.dumps({
      'clusterLogging': [{
          'types': [
              'api', 'audit', 'authenticator', 'controllerManager',
              'scheduler',
          ],
          'enabled': enable_logs,
      }]
  })
  vm_util.IssueCommand(
      util.AWS_PREFIX + [
          'eks', 'update-cluster-config',
          '--name', self.name,
          '--region', self.region,
          '--logging', payload,
      ],
      timeout=900,
  )

  def _raise_unless_active():
    """Raises a retryable error while the cluster status is not ACTIVE."""
    status_out, _, _ = vm_util.IssueCommand(
        util.AWS_PREFIX + [
            'eks', 'describe-cluster',
            '--name', self.name,
            '--region', self.region,
            '--query', 'cluster.status',
            '--output', 'text',
        ]
    )
    cluster_status = status_out.strip()
    if cluster_status == 'ACTIVE':
      return
    raise errors.Resource.RetryableCreationError(
        f'cluster status={cluster_status}'
    )

  # Poll every 5s for up to 900s; only the retryable error keeps it looping.
  wait_until_active = vm_util.Retry(
      poll_interval=5,
      fuzz=0,
      timeout=900,
      retryable_exceptions=(errors.Resource.RetryableCreationError,),
  )(_raise_unless_active)
  wait_until_active()
@@ -542,7 +1495,7 @@ class EksAutoCluster(BaseEksCluster): def __init__(self, spec): super().__init__(spec) self._ChooseSecondZone() - is_rare_gpu = self.gpu_type in _RARE_GPU_TYPES + is_rare_gpu = virtual_machine.GPU_TYPE.value in _RARE_GPU_TYPES self.use_spot: bool = aws_flags.USE_AWS_SPOT_INSTANCES.value or is_rare_gpu def _Create(self): @@ -574,6 +1527,34 @@ def _PostCreate(self): def _Delete(self): """Deletes the control plane and worker nodes.""" + # Clean up SSH key pair — safety net in case _DeleteDependencies didn't run + try: + aws_virtual_machine.AwsKeyFileManager.DeleteKeyfile(self.region) + except Exception: # pylint: disable=broad-except + pass + # Clean up dynamically created launch templates and capacity reservations + # Only runs if capacity reservations were actually created this run. + if getattr(FLAGS, 'eks_reserve_capacity_per_az', False): + for az in getattr(self, '_capacity_reservation_ids', {}).keys(): + vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'delete-launch-template', + '--launch-template-name', f'pkb-eks-lt-{az}', + '--region', self.region, + ], + raise_on_failure=False, + ) + logging.info('[EKS] Deleted launch template pkb-eks-lt-%s', az) + for az, res_id in getattr(self, '_capacity_reservation_ids', {}).items(): + vm_util.IssueCommand( + util.AWS_PREFIX + [ + 'ec2', 'cancel-capacity-reservation', + '--capacity-reservation-id', res_id, + '--region', self.region, + ], + raise_on_failure=False, + ) + logging.info('[EKS] Cancelled capacity reservation %s in %s', res_id, az) super()._Delete() cmd = [ FLAGS.eksctl, @@ -607,14 +1588,15 @@ def ResizeNodePool( def GetNodeSelectors(self, machine_type: str | None = None) -> dict[str, str]: """Get the node selectors section of a yaml for the provider.""" del machine_type # Unused. 
- # Theoretically needed in mixed mode, but deployments fail without it: - # https://docs.aws.amazon.com/eks/latest/userguide/associate-workload.html#_require_a_workload_is_deployed_to_eks_auto_mode_nodes + # Theoretically needed in mixed mode, but deployments fail without it. + # See: docs.aws.amazon.com/eks/latest/userguide/associate-workload.html + # #_require_a_workload_is_deployed_to_eks_auto_mode_nodes selectors = {'eks.amazonaws.com/compute-type': 'auto'} if self.use_spot: selectors['karpenter.sh/capacity-type'] = 'spot' - if self.gpu_type: + if virtual_machine.GPU_TYPE.value: selectors['eks.amazonaws.com/instance-gpu-name'] = ( - self.gpu_type + virtual_machine.GPU_TYPE.value ) return selectors @@ -646,10 +1628,15 @@ def __init__(self, spec): def _Create(self): """Creates the control plane and worker nodes.""" template_filename = vm_util.PrependTempDir('cloud-formation-template.yaml') + cfn_url = ( + 'https://raw.githubusercontent.com/aws/karpenter-provider-aws/' + + f'v{_KARPENTER_VERSION}/website/content/en/preview/' + + 'getting-started/getting-started-with-karpenter/' + + 'cloudformation.yaml') vm_util.IssueCommand([ 'curl', '-fsSL', - f'https://raw.githubusercontent.com/aws/karpenter-provider-aws/v{_KARPENTER_VERSION}/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml', + cfn_url, '-o', template_filename, ]) @@ -681,6 +1668,12 @@ def _Create(self): bootstrapping_nodepool.min_nodes = 1 bootstrapping_nodepool.max_nodes = 1 bootstrapping_nodepool.machine_type = 'm7i.2xlarge' + karpenter_policy_arn = ( + f'arn:aws:iam::{self.account}:policy/' + + f'KarpenterControllerPolicy-{self.name}') + karpenter_node_role_arn = ( + f'arn:aws:iam::{self.account}:role/' + + f'KarpenterNodeRole-{self.name}') create_json: dict[str, Any] = { 'metadata': { 'tags': {'karpenter.sh/discovery': self.name}, @@ -691,14 +1684,12 @@ def _Create(self): 'serviceAccountName': 'karpenter', 'roleName': f'{self.name}-karpenter', 
'permissionPolicyARNs': [ - f'arn:aws:iam::{self.account}:policy/KarpenterControllerPolicy-{self.name}' + karpenter_policy_arn ], }], }, 'iamIdentityMappings': [{ - 'arn': ( - f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}' - ), + 'arn': karpenter_node_role_arn, 'username': 'system:node:{{EC2PrivateDNSName}}', 'groups': ['system:bootstrappers', 'system:nodes'], }], @@ -739,15 +1730,16 @@ def _InstallAwsLoadBalancerController(self) -> None: policy_arn = (stdout or '').strip() if not policy_arn or policy_arn == 'None': with vm_util.NamedTemporaryFile(dir=vm_util.GetTempDir(), mode='w') as tf: + alb_policy_url = ( + 'https://raw.githubusercontent.com/kubernetes-sigs/' + + 'aws-load-balancer-controller/' + + 'v2.13.4/docs/install/iam_policy.json') vm_util.IssueCommand([ 'curl', '-sSL', '-o', tf.name, - ( - 'https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/' - 'v2.13.4/docs/install/iam_policy.json' - ), + alb_policy_url, ]) stdout, _, _ = vm_util.IssueCommand( util.AWS_PREFIX @@ -788,11 +1780,14 @@ def _InstallAwsLoadBalancerController(self) -> None: in stderr, ) # 4) Apply CRDs + crds_url = ( + 'https://raw.githubusercontent.com/aws/eks-charts/master/' + + 'stable/aws-load-balancer-controller/crds/crds.yaml') kubectl.RunKubectlCommand( [ 'apply', '-f', - 'https://raw.githubusercontent.com/aws/eks-charts/master/stable/aws-load-balancer-controller/crds/crds.yaml', + crds_url, ], suppress_failure=lambda stdout, stderr, retcode: 'already exists' in stderr, @@ -883,7 +1878,8 @@ def _WaitForIngress(self, name: str, namespace: str, port: int) -> str: def _PostIngressNetworkingFixups( self, namespace: str, name: str, port: int, address: str ) -> None: - """Fixs ALB -> nodes connectivity to prevent 504 errors from unhealthy targets.""" + """Fixes ALB -> node connectivity to prevent 504 errors.""" + del namespace, name # Unused # 1) Get ALB security group from address host = ( @@ -1008,7 +2004,7 @@ def _PostCreate(self): 
'daemonset/aws-node', '-n', 'kube-system', - '--timeout=%ds' % vm_util.DEFAULT_TIMEOUT, + f'--timeout={vm_util.DEFAULT_TIMEOUT}s', ], timeout=vm_util.DEFAULT_TIMEOUT, ) @@ -1093,12 +2089,15 @@ def _PostCreate(self): # Get the AMI version for current kubernetes version. # See e.g. https://karpenter.sh/docs/tasks/managing-amis/ for not using # @latest. + ssm_ami_path = ( + f'/aws/service/eks/optimized-ami/{self.cluster_version}/' + + 'amazon-linux-2023/x86_64/standard/recommended/image_id') image_id, _, _ = vm_util.IssueCommand([ 'aws', 'ssm', 'get-parameter', '--name', - f'/aws/service/eks/optimized-ami/{self.cluster_version}/amazon-linux-2023/x86_64/standard/recommended/image_id', + ssm_ami_path, '--region', self.region, '--query', @@ -1219,7 +2218,7 @@ def _DeleteDependencies(self): else: logging.info( 'Karpenter node role %s not found or empty response; skipping' - ' instance profile cleanup', + + ' instance profile cleanup', node_role, ) profiles_json = {'InstanceProfiles': []} @@ -1371,7 +2370,7 @@ def _CleanupKarpenter(self): for eni_id in eni_ids: # Bind eni_id by default to avoid loop closure issues if # this is refactored. - def _DeleteOneEni(eni_id=eni_id) -> None: + def _delete_one_eni(eni_id=eni_id) -> None: _, stderr, retcode = vm_util.IssueCommand( [ 'aws', @@ -1402,7 +2401,7 @@ def _DeleteOneEni(eni_id=eni_id) -> None: poll_interval=10, max_retries=5, retryable_exceptions=(errors.Resource.RetryableDeletionError,), - )(_DeleteOneEni)() + )(_delete_one_eni)() def _IsReady(self): """Returns True if cluster is running. 
Autopilot defaults to 0 nodes.""" diff --git a/perfkitbenchmarker/providers/aws/flags.py b/perfkitbenchmarker/providers/aws/flags.py index 6871a085e5..b7f6ca214c 100644 --- a/perfkitbenchmarker/providers/aws/flags.py +++ b/perfkitbenchmarker/providers/aws/flags.py @@ -376,3 +376,22 @@ def _ValidatePreprovisionedDataAccess(flag_values: dict[str, Any]) -> bool: None, 'If supplied, creates the DocumentDB instance from the snapshot.', ) + +# Flags used by the k8s_management benchmark on EKS. The first optionally +# reserves EC2 capacity per AZ; the second skips EBS CSI driver setup, which +# is safe without persistent volumes and saves ~3 minutes per run. +flags.DEFINE_boolean( + 'eks_reserve_capacity_per_az', + False, + 'If True, dynamically creates EC2 capacity reservations and launch ' + 'templates per AZ before nodegroup creation. Enable only for the ' + 'k8s_management benchmark. Leaving enabled for other benchmarks ' + 'wastes reserved capacity on wrong instance types.', +) +flags.DEFINE_boolean( + 'eks_skip_ebs_csi', + False, + 'If True, skip EBS CSI driver setup (OIDC + IAM role + addon install) ' + 'during EKS cluster creation. Safe for the k8s_management benchmark ' + 'which does not use persistent volumes.
def CreateNodePool(
    self,
    nodepool_config: container.BaseNodePoolConfig,
    node_version: str | None = None,
) -> None:
  """Adds one named node pool to the cluster with `az aks nodepool add`.

  Args:
    nodepool_config: Pool settings; its `name` is used for the Azure pool
      name and for the `pkb_nodepool` label.
    node_version: Optional Kubernetes version for the pool. When given, it
      replaces any `--kubernetes-version` value already produced by
      `_GetNodeFlags`, or is appended if that flag is absent.

  Raises:
    errors.Resource.CreationError: if the az command exits non-zero.
  """
  version_flag = '--kubernetes-version'
  pool_flags = self._GetNodeFlags(nodepool_config)
  if node_version:
    if version_flag not in pool_flags:
      pool_flags.extend([version_flag, node_version])
    else:
      # Overwrite the value that follows the existing flag.
      pool_flags[pool_flags.index(version_flag) + 1] = node_version

  add_cmd = [azure.AZURE_PATH, 'aks', 'nodepool', 'add']
  add_cmd += ['--cluster-name', self.name]
  add_cmd += ['--name', _AzureNodePoolName(nodepool_config.name)]
  add_cmd += ['--labels', f'pkb_nodepool={nodepool_config.name}']
  add_cmd += pool_flags

  _, stderr, retcode = vm_util.IssueCommand(
      add_cmd, timeout=1800, raise_on_failure=False
  )
  if retcode:
    raise errors.Resource.CreationError(stderr)
def CreateNodePoolAsync(
    self,
    nodepool_config: container.BaseNodePoolConfig,
    node_version: str | None = None,
) -> str:
  """Starts `az aks nodepool add --no-wait` and returns a wait handle.

  The command is issued with a 600s timeout. If it fails with one of a small
  set of error markers (operation not allowed, conflicting operation in
  progress, max agent pool count reached), it is retried up to 5 more times
  with a 30s sleep between attempts. Any other failure, or a failure on the
  last attempt, raises immediately.

  Args:
    nodepool_config: Pool settings; its `name` is used for the Azure pool
      name and for the `pkb_nodepool` label.
    node_version: Optional Kubernetes version for the pool. When given, it
      replaces any `--kubernetes-version` value already produced by
      `_GetNodeFlags`, or is appended if that flag is absent.

  Returns:
    The op handle 'np_succeeded:<azure pool name>'.

  Raises:
    errors.Resource.CreationError: if the command ultimately fails.
  """
  version_flag = '--kubernetes-version'
  pool_flags = self._GetNodeFlags(nodepool_config)
  if node_version:
    if version_flag not in pool_flags:
      pool_flags.extend([version_flag, node_version])
    else:
      pool_flags[pool_flags.index(version_flag) + 1] = node_version

  add_cmd = [azure.AZURE_PATH, 'aks', 'nodepool', 'add']
  add_cmd += ['--cluster-name', self.name]
  add_cmd += ['--name', _AzureNodePoolName(nodepool_config.name)]
  add_cmd += ['--labels', f'pkb_nodepool={nodepool_config.name}']
  add_cmd += ['--no-wait']
  add_cmd += pool_flags

  # stderr substrings treated as transient (concurrent-op or pool-count
  # limits, per the original author's note).
  transient_markers = (
      'OperationNotAllowed',
      'ConflictingOperationInProgress',
      'MaxAgentPoolCountReached',
  )
  retry_budget = 5
  sleep_seconds = 30
  attempt_number = 0
  while True:
    attempt_number += 1
    _, stderr, retcode = vm_util.IssueCommand(
        add_cmd, timeout=600, raise_on_failure=False
    )
    if not retcode:
      return f'np_succeeded:{_AzureNodePoolName(nodepool_config.name)}'
    out_of_retries = attempt_number > retry_budget
    if out_of_retries or not any(m in stderr for m in transient_markers):
      raise errors.Resource.CreationError(stderr)
    logging.warning(
        '[AKS] CreateNodePoolAsync %s: retryable error (attempt %d/%d),'
        ' sleeping %ds: %s',
        _AzureNodePoolName(nodepool_config.name),
        attempt_number, retry_budget, sleep_seconds, stderr[:120],
    )
    time.sleep(sleep_seconds)
+ _, stderr, retcode = vm_util.IssueCommand( + cmd, timeout=600, raise_on_failure=False + ) + if retcode: + if 'NotFound' in stderr or 'not found' in stderr.lower(): + logging.info( + '[AKS] DeleteNodePoolAsync: %s already gone — treating as success', + _AzureNodePoolName(name), + ) + return f'np_gone:{_AzureNodePoolName(name)}' + raise errors.Resource.CreationError(stderr) + return f'np_gone:{_AzureNodePoolName(name)}' + + def UpdateClusterAsync(self) -> str: + """Triggers a node-count scale on the system node pool to create a + long-running cluster update for Scenario B overlap testing. + + Scaling the system pool by ±1 node takes 3-8 minutes on AKS, which + creates a meaningful overlap window for the concurrent NodePool create. + The scale alternates +1/-1 each call so it is always a real change. + Falls back to a tag update if the system pool cannot be identified. + """ + # Find the system node pool name + list_cmd = [ + azure.AZURE_PATH, 'aks', 'nodepool', 'list', + '--cluster-name', self.name, + '--query', '[?mode==`System`].{name:name,count:count}', + '--output', 'json', + ] + self.resource_group.args + out, _, rc = vm_util.IssueCommand(list_cmd, raise_on_failure=False) + if not rc and out.strip(): + try: + pools = json.loads(out.strip()) + if pools: + pool_name = pools[0]['name'] + current_count = int(pools[0]['count']) + # Toggle: scale to current+1 or current-1 (minimum 1) + new_count = current_count + 1 if current_count <= 1 else current_count - 1 + scale_cmd = [ + azure.AZURE_PATH, 'aks', 'nodepool', 'scale', + '--cluster-name', self.name, + '--name', pool_name, + '--node-count', str(new_count), + '--no-wait', + ] + self.resource_group.args + _, stderr, retcode = vm_util.IssueCommand( + scale_cmd, timeout=300, raise_on_failure=False + ) + if not retcode: + logging.info( + '[AKS] UpdateClusterAsync: scaling system pool %s %d->%d', + pool_name, current_count, new_count, + ) + return 'cluster_succeeded' + except (ValueError, KeyError, json.JSONDecodeError) 
def ResolveNodePoolVersions(self) -> tuple[str, str]:
  """Returns (initial, target) AKS node pool versions.

  Both versions are derived from the version already stored on this object
  (`cluster_version`, falling back to `k8s_version`) rather than by querying
  kubectl.

    initial = N-1 (adjacent minor below the cluster version)
    target  = N   (the cluster's own minor version)

  Returns:
    A `(initial, target)` tuple of 'MAJOR.MINOR' strings.

  Raises:
    ValueError: if no version is set, the version does not start with
      numeric 'MAJOR.MINOR' components, or the minor is 0 (there is no
      adjacent minor below it).
  """
  cluster_ver = self.cluster_version or self.k8s_version
  if not cluster_ver:
    raise ValueError(
        '[AKS] ResolveNodePoolVersions: no cluster version is set'
    )
  # Drop a leading 'v' and ignore any patch suffix: 'v1.34.7' -> (1, 34).
  parts = str(cluster_ver).strip().lstrip('v').split('.')
  try:
    major, minor = int(parts[0]), int(parts[1])
  except (IndexError, ValueError) as e:
    raise ValueError(
        '[AKS] ResolveNodePoolVersions: cannot parse MAJOR.MINOR from '
        f'{cluster_ver!r}'
    ) from e
  if minor < 1:
    # Without this guard the subtraction below would produce e.g. '1.-1'.
    raise ValueError(
        '[AKS] ResolveNodePoolVersions: no minor version below '
        f'{cluster_ver!r}'
    )
  target = f'{major}.{minor}'
  initial = f'{major}.{minor - 1}'
  logging.info(
      '[AKS] ResolveNodePoolVersions: cluster=%s initial=%s target=%s',
      cluster_ver, initial, target,
  )
  return initial, target
+ out, err, rc = vm_util.IssueCommand( + [ + azure.AZURE_PATH, + 'aks', + 'nodepool', + 'show', + '--cluster-name', + self.name, + '--name', + name, + '--query', + 'provisioningState', + '--output', + 'tsv', + ] + + self.resource_group.args, + raise_on_failure=False, + timeout=120, + ) + if rc: + if 'NotFound' in (err or '') or 'not found' in (err or '').lower(): + raise errors.Resource.CreationError( + f'nodepool {name} not found while waiting for Succeeded: {err}' + ) + raise errors.Resource.RetryableCreationError(err) + status = out.strip() + if status == 'Succeeded': + return + if status == 'Failed': + raise errors.Resource.CreationError( + f'nodepool {name} ended in Failed' + ) + raise errors.Resource.RetryableCreationError( + f'nodepool {name} state={status}' + ) + + @vm_util.Retry( + poll_interval=5, + fuzz=0, + timeout=3600, + retryable_exceptions=(errors.Resource.RetryableDeletionError,), + ) + def _wait_np_gone(): + # fix: per-poll timeout bound. + _, err, rc = vm_util.IssueCommand( + [ + azure.AZURE_PATH, + 'aks', + 'nodepool', + 'show', + '--cluster-name', + self.name, + '--name', + name, + ] + + self.resource_group.args, + raise_on_failure=False, + timeout=120, + ) + if rc and ('NotFound' in (err or '') or 'not found' in (err or '').lower()): + return + if rc: + raise errors.Resource.RetryableDeletionError(err) + raise errors.Resource.RetryableDeletionError( + f'nodepool {name} still present' + ) + + @vm_util.Retry( + poll_interval=5, + fuzz=0, + timeout=3600, + retryable_exceptions=(errors.Resource.RetryableCreationError,), + ) + def _wait_cluster_succeeded(): + # fix: per-poll timeout bound. 
+ out, err, rc = vm_util.IssueCommand( + [ + azure.AZURE_PATH, + 'aks', + 'show', + '--name', + self.name, + '--query', + 'provisioningState', + '--output', + 'tsv', + ] + + self.resource_group.args, + raise_on_failure=False, + timeout=120, + ) + if rc: + raise errors.Resource.RetryableCreationError(err) + status = out.strip() + if status == 'Succeeded': + return + if status == 'Failed': + raise errors.Resource.CreationError('cluster update ended in Failed') + raise errors.Resource.RetryableCreationError( + f'cluster state={status}' + ) + + if kind == 'np_succeeded': + _wait_np_succeeded() + elif kind == 'np_gone': + _wait_np_gone() + elif kind == 'cluster_succeeded': + _wait_cluster_succeeded() + else: + raise ValueError(f'Unknown AKS op handle: {op_handle!r}') + class AksAutomaticCluster(AksCluster): """Class representing an AKS Automatic cluster, which has managed node pools. diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py index 6b0076aa69..76bd8afb97 100644 --- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py @@ -18,13 +18,16 @@ import math import os import re +import time import typing from typing import Any from absl import flags from perfkitbenchmarker import errors from perfkitbenchmarker import provider_info +from perfkitbenchmarker import virtual_machine from perfkitbenchmarker import virtual_machine_spec +from perfkitbenchmarker import vm_util from perfkitbenchmarker.configs import container_spec as container_spec_lib from perfkitbenchmarker.providers.gcp import flags as gcp_flags from perfkitbenchmarker.providers.gcp import gce_disk @@ -52,8 +55,8 @@ def _CalculateCidrSize(nodes: int) -> int: # So 2^(32 - nodes) - 2^(32 - 20) >= 2^(32 - 24) * CIDR # OR CIDR <= 32 - log2(2^8 * nodes + 2^12) cidr_size = int(32 - math.log2((nodes << 8) + (1 << 12))) - # /19 is narrowest CIDR range GKE 
supports - return min(cidr_size, 19) + # /16 is narrowest CIDR range GKE supports + return min(cidr_size, 16) class GoogleArtifactRegistry(container_registry.BaseContainerRegistry): @@ -259,10 +262,13 @@ def GetNodePoolNames(self) -> list[str]: # Command `gcloud container node-pools list` does not work for Autopilot # clusters - node pools are hidden and command results in 4xx. cmd = self._GcloudCommand('container', 'clusters', 'describe', self.name) - cmd.flags['flatten'] = 'nodePools' - cmd.flags['format'] = 'value(nodePools.name)' + cmd.flags['format'] = 'json' stdout, _, _ = cmd.Issue() - return stdout.split() + try: + cluster_info = json.loads(stdout) + return [np['name'] for np in cluster_info.get('nodePools', [])] + except (json.JSONDecodeError, ValueError, KeyError, TypeError): + return stdout.split() def GetMachineTypeFromNodeName(self, node_name: str) -> str | None: """Get the machine type from the node name.""" @@ -325,6 +331,8 @@ def InitializeNodePoolForCloud( nodepool_config.disk_size = vm_config.boot_disk_size nodepool_config.max_local_disks = vm_config.max_local_disks nodepool_config.ssd_interface = vm_config.ssd_interface + nodepool_config.gpu_type = vm_config.gpu_type + nodepool_config.gpu_count = vm_config.gpu_count nodepool_config.threads_per_core = vm_config.threads_per_core nodepool_config.gce_tags = vm_config.gce_tags nodepool_config.min_cpu_platform = vm_config.min_cpu_platform @@ -360,6 +368,9 @@ def GetResourceMetadata(self) -> dict[str, Any]: result['gce_local_ssd_count'] = self.default_nodepool.max_local_disks result['gce_local_ssd_interface'] = self.default_nodepool.ssd_interface result['gke_nccl_fast_socket'] = self.enable_nccl_fast_socket + if 'nccl' in self.nodepools: + result['gpu_type'] = self.nodepools['nccl'].gpu_type + result['gpu_count'] = self.nodepools['nccl'].gpu_count if self.image_type: result['image_type'] = self.image_type if gcp_flags.MAX_CPU.value: @@ -633,6 +644,339 @@ def ResizeNodePool( cmd.flags['node-pool'] = 
def CreateNodePool(
    self,
    nodepool_config: container.BaseNodePoolConfig,
    node_version: str | None = None,
) -> None:
  """Creates one named node pool on this cluster.

  Builds a `container node-pools create` gcloud command, lets
  `_AddNodeParamsToCmd` add the node parameters, and issues it through
  `_IssueResourceCreationCommand`.

  Args:
    nodepool_config: Pool settings; its `name` becomes the node pool name.
    node_version: Optional value for the `node-version` flag; the flag is
      only set when this is truthy.
  """
  gcloud_args = (
      'container',
      'node-pools',
      'create',
      nodepool_config.name,
      '--cluster',
      self.name,
  )
  create_cmd = self._GcloudCommand(*gcloud_args)
  self._AddNodeParamsToCmd(nodepool_config, create_cmd)
  if node_version:
    create_cmd.flags['node-version'] = node_version
  self._IssueResourceCreationCommand(create_cmd)
+ """ + cmd = self._GcloudCommand('container', 'clusters', 'update', self.name) + cmd.flags['update-labels'] = f'k8s-mgmt-ts={int(time.time())}' + cmd.Issue(timeout=ONE_HOUR) + + # ---- Async variants (return opaque handles) ------------------------------- + + def _IssueAsync(self, cmd: util.GcloudCommand) -> str: + """Issues a gcloud command with --async, returns the operation name.""" + cmd.args.append('--async') + cmd.flags['format'] = 'value(name)' + stdout, stderr, retcode = cmd.Issue(timeout=600, raise_on_failure=False) + if retcode: + raise errors.Resource.CreationError(stderr) + op_name = stdout.strip().splitlines()[-1].strip() if stdout else '' + if not op_name: + raise errors.Resource.CreationError( + f'GKE async command returned no operation name; stderr={stderr}' + ) + return op_name + + def _GetLatestOperationName( + self, + operation_type: str = 'UPGRADE_NODES', + target_name: str = '', + max_attempts: int = 5, + retry_delay: int = 3, + op_start_time: float = 0.0, + ) -> str: + """Returns the name of the most recent matching operation for this cluster. + + The async gcloud command may return before the GKE control plane has + transitioned the operation from PENDING to RUNNING. For fast operations + (e.g. label updates) the operation may already be DONE by the time this + method is called. Passing op_start_time handles both cases. + + Args: + operation_type: GKE operationType to filter on, e.g. 'UPGRADE_NODES' + for node pool upgrades or 'UPDATE_CLUSTER' for cluster-level + updates via 'gcloud container clusters update'. + target_name: Substring to match against targetLink (e.g. nodepool name + for UPGRADE_NODES, or cluster name for UPDATE_CLUSTER). If empty, + falls back to self.name (the cluster name). + max_attempts: Number of query attempts before giving up. + retry_delay: Seconds to wait between attempts. + op_start_time: Unix timestamp recorded just before the async gcloud + command was issued. 
When provided, the status filter is broadened + to include DONE (so fast-completing operations are found) and a + startTime >= guard is added to avoid matching old operations. + + Returns: + Operation name string, or empty string if none found. + """ + link_target = target_name or self.name + if op_start_time: + # Fast operations (e.g. --update-labels) may be DONE before we query. + # Broaden the status filter and add a startTime guard (with a 30-second + # buffer for clock skew) to avoid picking up older completed operations. + from_time = time.strftime( + '%Y-%m-%dT%H:%M:%SZ', time.gmtime(op_start_time - 30) + ) + status_filter = '(status=RUNNING OR status=PENDING OR status=DONE)' + time_filter = f' AND startTime>="{from_time}"' + else: + # Slow operations (e.g. node pool upgrades): only look for active ops. + status_filter = '(status=RUNNING OR status=PENDING)' + time_filter = '' + + filter_str = ( + f'operationType={operation_type} AND ' + f'{status_filter} AND ' + f'targetLink ~ {link_target}' + f'{time_filter}' + ) + for attempt in range(1, max_attempts + 1): + list_cmd = self._GcloudCommand('container', 'operations', 'list') + list_cmd.flags['filter'] = filter_str + list_cmd.flags['sort-by'] = '~startTime' + list_cmd.flags['limit'] = 1 + list_cmd.flags['format'] = 'value(name)' + stdout, stderr, _ = list_cmd.Issue(raise_on_failure=False) + op_name = stdout.strip() + if op_name: + logging.info( + '_GetLatestOperationName: found op %s (type=%s target=%s) ' + '(attempt %d/%d)', op_name, operation_type, link_target, + attempt, max_attempts, + ) + return op_name + logging.warning( + '_GetLatestOperationName: no %s op found for target=%s ' + '(attempt %d/%d), retrying in %ds. 
def CreateNodePoolAsync(
    self,
    nodepool_config: container.BaseNodePoolConfig,
    node_version: str | None = None,
) -> str:
  """Starts creation of one named node pool and returns the operation name.

  Args:
    nodepool_config: Pool settings; its `name` becomes the node pool name.
    node_version: Optional value for the `node-version` flag; the flag is
      only set when this is truthy.

  Returns:
    Whatever `_IssueAsync` returns for the create command.
  """
  gcloud_args = (
      'container',
      'node-pools',
      'create',
      nodepool_config.name,
      '--cluster',
      self.name,
  )
  create_cmd = self._GcloudCommand(*gcloud_args)
  self._AddNodeParamsToCmd(nodepool_config, create_cmd)
  if node_version:
    create_cmd.flags['node-version'] = node_version
  # Per the original author's note, --async is incompatible with the long
  # --timeout flag in some gcloud builds, so drop it if it was set.
  if 'timeout' in create_cmd.flags:
    del create_cmd.flags['timeout']
  return self._IssueAsync(create_cmd)
Original error: %s', name, e + ) + op_name = self._GetLatestOperationName( + operation_type='UPGRADE_NODES', target_name=name + ) + if not op_name: + raise + return op_name + + def DeleteNodePoolAsync(self, name: str) -> str: + cmd = self._GcloudCommand( + 'container', + 'node-pools', + 'delete', + name, + '--cluster', + self.name, + ) + cmd.args.append('--quiet') + return self._IssueAsync(cmd) + + def UpdateClusterAsync(self) -> str: + cmd = self._GcloudCommand('container', 'clusters', 'update', self.name) + cmd.flags['update-labels'] = f'k8s-mgmt-ts={int(time.time())}' + # 'gcloud container clusters update --async' suppresses stdout when + # --quiet is active (same behaviour as 'clusters upgrade'), so the + # operation name is never printed. Remove --quiet here; the label-update + # is non-interactive so no confirmation prompt is needed. + cmd.flags.pop('quiet', None) + # Record start time BEFORE issuing. The label-update operation completes + # in seconds, so it may already be DONE by the time the fallback queries + # the operations list. The timestamp lets us safely include DONE ops + # without matching older completed operations from previous runs. + op_start_time = time.time() + try: + return self._IssueAsync(cmd) + except errors.Resource.CreationError as e: + if 'returned no operation name' not in str(e): + raise + # Fallback: gcloud returned retcode=0 but empty stdout. Query the + # operations list including DONE status (fast label-update ops complete + # before we query) guarded by op_start_time to avoid stale matches. + logging.warning( + 'UpdateClusterAsync: falling back to operations list for cluster %s.' + ' Original error: %s', self.name, e + ) + op_name = self._GetLatestOperationName( + operation_type='UPDATE_CLUSTER', + target_name=self.name, + op_start_time=op_start_time, + ) + if not op_name: + raise + return op_name + + def ResolveNodePoolVersions(self) -> tuple[str, str]: + """Returns (initial, target) GKE node versions: initial=N-1, target=N. 
+ + GKE requires fully-qualified node versions (e.g. '1.34.4-gke.1234'), + so we query `gcloud container get-server-config` and pick the newest + valid version per minor. + """ + cmd = self._GcloudCommand('container', 'get-server-config') + cmd.flags['format'] = 'json' + stdout, stderr, retcode = cmd.Issue(raise_on_failure=False) + if retcode: + raise errors.Resource.GetError( + f'gcloud get-server-config failed: {stderr}' + ) + config = json.loads(stdout) + valid = list(config.get('validNodeVersions', [])) + if not valid: + raise errors.Resource.GetError( + 'GKE get-server-config returned no validNodeVersions' + ) + + def _version_tuple(v): + return tuple(int(x) for x in v.split('-', 1)[0].split('.')) + + valid.sort(key=_version_tuple, reverse=True) + target = valid[0] + initial_minor = kubernetes_cluster.AdjacentMinorBelow(target) + for v in valid: + if kubernetes_cluster.BareMinor(v) == initial_minor: + return v, target + raise errors.Resource.GetError( + f'No GKE node version found for minor {initial_minor!r}; ' + f'available top 5: {valid[:5]}' + ) + + def WaitForOperation(self, op_handle: str) -> None: + """Polls a GKE operation until terminal; raises on failure.""" + + @vm_util.Retry( + poll_interval=5, + fuzz=0, + timeout=ONE_HOUR, + retryable_exceptions=(errors.Resource.RetryableCreationError,), + ) + def _poll(): + describe = self._GcloudCommand( + 'container', + 'operations', + 'describe', + op_handle, + ) + #describe.flags['format'] = 'value(status)' + describe.flags['format'] = 'json' + out, err, rc = describe.Issue(raise_on_failure=False) + if rc: + raise errors.Resource.RetryableCreationError( + f'describe op failed: {err}' + ) + #status = out.strip() + try: + status = json.loads(out).get('status') + except (json.JSONDecodeError, ValueError): + status = out.strip() + if status == 'DONE': + return + if status in ('ABORTING', 'ABORTED'): + raise errors.Resource.CreationError(f'op {op_handle} aborted') + raise errors.Resource.RetryableCreationError( + 
f'op {op_handle} status={status}' + ) + + _poll() class GkeAutopilotCluster(BaseGkeCluster): """Class representing an Autopilot GKE cluster, which has no nodepools.""" @@ -732,4 +1076,4 @@ def GetNodeSelectors(self, machine_type: str | None = None) -> dict[str, str]: def ResizeNodePool( self, new_size: int, node_pool: str = container_cluster.DEFAULT_NODEPOOL ): - raise NotImplementedError('Autopilot clusters do not support resizing.') + raise NotImplementedError('Autopilot clusters do not support resizing.') \ No newline at end of file diff --git a/perfkitbenchmarker/resources/container_service/kubernetes_cluster.py b/perfkitbenchmarker/resources/container_service/kubernetes_cluster.py index 9b98d15508..fecb126114 100644 --- a/perfkitbenchmarker/resources/container_service/kubernetes_cluster.py +++ b/perfkitbenchmarker/resources/container_service/kubernetes_cluster.py @@ -1,5 +1,6 @@ """Classes related to KubernetesCluster.""" +import abc import functools import json import logging @@ -10,7 +11,7 @@ from perfkitbenchmarker import vm_util from perfkitbenchmarker.configs import container_spec as container_spec_lib from perfkitbenchmarker.resources import kubernetes_inference_server -from perfkitbenchmarker.resources.container_service import container as container_lib +from perfkitbenchmarker.resources.container_service import (container as container_lib) from perfkitbenchmarker.resources.container_service import container_cluster from perfkitbenchmarker.resources.container_service import kubectl from perfkitbenchmarker.resources.container_service import kubernetes @@ -54,6 +55,7 @@ def Create(self, restore: bool = False) -> None: self.inference_server.Create() def _PostCreate(self): + """Starts the event poller after the cluster has been created.""" super()._PostCreate() if self.event_poller: self.event_poller.StartPolling() @@ -151,6 +153,7 @@ def GetDefaultStorageClass(self) -> str: def GetNodeSelectors(self, machine_type: str | None = None) -> dict[str, str]: 
"""Gets the node selectors section of a yaml for the provider.""" + del machine_type # Unused; subclasses may use it. return {} def ModifyPodSpecPlacementYaml( @@ -165,9 +168,9 @@ def ModifyPodSpecPlacementYaml( the most likely to change from cloud to cloud. Args: - yaml_dicts: The list of yaml dicts to search through & modify. See - https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.34/#podspec-v1-core - for documentation on the pod spec fields. This is modified in place. + yaml_dicts: The list of yaml dicts to search through & modify. See the + K8s PodSpec API docs for pod spec field documentation. Modified + in place. name: The name of the app. machine_type: A specified machine type to request. """ @@ -195,9 +198,8 @@ def _ModifyPodSpecPlacementYaml( the most likely to change from cloud to cloud. Args: - pod_spec_yaml: The pod spec yaml to modify. See - https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.34/#podspec-v1-core - for documentation on the pod spec fields. This is modified in place. + pod_spec_yaml: The pod spec yaml to modify. See the K8s PodSpec API + docs for pod spec field documentation. This is modified in place. name: The name of the app. machine_type: A specified machine type to request. """ @@ -304,9 +306,126 @@ def _GetAddressFromIngress(self, ingress_out: str): ) return 'http://' + ip.strip() - def AddNodepool(self, batch_name: str, pool_id: str): - """Adds an additional nodepool with the given name to the cluster.""" - pass + def AddNodepool(self, batch_name: str, pool_id: str) -> None: + """Adds a node pool; delegates to CreateNodePool for standard clusters. + + Karpenter-based subclasses override this to apply a manifest instead. 
+ """ + nodepool_config = container_lib.BaseNodePoolConfig( + name=f'{batch_name}-{pool_id}', + ) + self.CreateNodePool(nodepool_config) + + def CreateNodePool( + self, + nodepool_config: container_lib.BaseNodePoolConfig, + node_version: str | None = None, + ) -> None: + """Creates a single named node pool on the cluster (blocks until ready). + + Args: + nodepool_config: Node pool definition (name, machine type, node count). + node_version: Optional Kubernetes version to pin the node pool to. None + means use the cluster default. + """ + raise NotImplementedError + + def DeleteNodePool(self, name: str) -> None: + """Deletes the named node pool (blocks until removed).""" + raise NotImplementedError + + def UpgradeNodePool(self, name: str, target_version: str) -> None: + """Upgrades the named node pool to the given Kubernetes version.""" + raise NotImplementedError + + def UpdateCluster(self) -> None: + """Performs a lightweight cluster-level update operation (blocks). + + Intended for management-plane benchmarks that need to overlap a real + cluster-level operation with a node-pool operation. The implementation + should issue a control-plane mutation (so an actual operation runs) that + is non-destructive and idempotent across repeated invocations. + """ + raise NotImplementedError + + def CreateNodePoolAsync( + self, + nodepool_config: container_lib.BaseNodePoolConfig, + node_version: str | None = None, + ) -> str: + """Initiates node-pool create; returns opaque op handle. Does NOT wait.""" + raise NotImplementedError + + def UpgradeNodePoolAsync(self, name: str, target_version: str) -> str: + """Initiates node-pool upgrade; returns opaque op handle. Does NOT wait.""" + raise NotImplementedError + + def DeleteNodePoolAsync(self, name: str) -> str: + """Initiates node-pool delete; returns opaque op handle. Does NOT wait.""" + raise NotImplementedError + + def UpdateClusterAsync(self) -> str: + """Initiates cluster-level update. 
Returns op handle; does NOT wait.""" + raise NotImplementedError + + @abc.abstractmethod + def GetNodePoolNames(self) -> list[str]: + """Returns the names of all node pools currently in the cluster. + + Used by the kubernetes_management benchmark to: + - Sweep stale pkbm* pools before each run (clean-start spec requirement) + - Re-list live pools after creates before deleting (avoids stale names) + """ + + def WaitForOperation(self, op_handle: str) -> None: + """Blocks until the operation identified by op_handle completes. + + Args: + op_handle: provider-specific opaque string from one of the *Async + methods above. + + Raises: + errors.Resource.RetryableCreationError or similar on timeout/failure. + """ + raise NotImplementedError + + def ResolveNodePoolVersions(self) -> tuple[str, str]: + """Returns (initial, target) K8s versions per benchmark spec. + + Spec contract: + target = cluster's current K8s version (the latest available) + initial = the adjacent minor below target (e.g., target=1.35 -> 1.34) + Default implementation returns bare-minor strings ("1.34", "1.35") which + EKS and AKS accept directly. Providers requiring fully-qualified versions + (notably GKE) must override. + """ + target = BareMinor(self.k8s_version) + initial = AdjacentMinorBelow(self.k8s_version) + return initial, target + + +def BareMinor(version: str) -> str: + """Returns the 'major.minor' part of a K8s version string. + + Accepts and normalizes formats like 'v1.35.4', '1.35.4-gke.1234', '1.35'. 
+ """ + if version.startswith('v'): + version = version[1:] + bare = version.split('-', 1)[0] + parts = bare.split('.') + if len(parts) < 2 or not parts[0].isdigit() or not parts[1].isdigit(): + raise ValueError(f'Cannot parse K8s version: {version!r}') + return f'{parts[0]}.{parts[1]}' + + +def AdjacentMinorBelow(version: str) -> str: + """Returns the bare minor one below the given version: '1.35.4' -> '1.34'.""" + bare = BareMinor(version) + major_s, minor_s = bare.split('.') + minor = int(minor_s) + if minor <= 0: + raise ValueError(f'No adjacent minor below {version!r}') + return f'{major_s}.{minor - 1}' def _DeleteAllFromDefaultNamespace(): diff --git a/tests/linux_benchmarks/kubernetes_management_benchmark_test.py b/tests/linux_benchmarks/kubernetes_management_benchmark_test.py new file mode 100644 index 0000000000..6852c8df46 --- /dev/null +++ b/tests/linux_benchmarks/kubernetes_management_benchmark_test.py @@ -0,0 +1,1105 @@ +# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tests for linux_benchmarks.kubernetes_management_benchmark.""" + +# pylint: disable=invalid-name,protected-access + +import threading +import time +import unittest +from unittest import mock + +from absl import flags +from absl.testing import flagsaver +from perfkitbenchmarker import errors +from perfkitbenchmarker import sample +from perfkitbenchmarker.linux_benchmarks import kubernetes_management_benchmark +from perfkitbenchmarker.resources.container_service import kubernetes_cluster +from tests import pkb_common_test_case + +FLAGS = flags.FLAGS + +_CLUSTER_NAME = 'test-cluster' + + +def _make_sample(metric, value, unit='seconds', metadata=None): + return sample.Sample(metric, value, unit, metadata or {}) + + +def _make_mock_cluster( + name=_CLUSTER_NAME, + k8s_version='1.34', + pool_names=None, +): + """Creates a fully-stubbed KubernetesCluster mock for use in tests.""" + cluster = mock.create_autospec( + kubernetes_cluster.KubernetesCluster, instance=True + ) + cluster.name = name + cluster.k8s_version = k8s_version + cluster.cluster_version = k8s_version + cluster.GetNodePoolNames.return_value = pool_names or [] + cluster.ResolveNodePoolVersions.return_value = ('1.33', '1.34') + cluster.CreateNodePoolAsync.return_value = 'op-create-1' + cluster.UpgradeNodePoolAsync.return_value = 'op-upgrade-1' + cluster.DeleteNodePoolAsync.return_value = 'op-delete-1' + cluster.UpdateClusterAsync.return_value = 'op-update-1' + cluster.WaitForOperation.return_value = None + default_np = mock.MagicMock() + default_np.machine_type = 'e2-standard-2' + default_np.num_nodes = 1 + default_np.min_nodes = 1 + default_np.max_nodes = 1 + default_np.zone = 'us-central1-a' + default_np.disk_size = 100 + default_np.name = 'default-pool' + cluster.default_nodepool = default_np + return cluster + + +def _make_mock_benchmark_spec(cluster=None): + spec = mock.MagicMock() + spec.container_cluster = cluster or _make_mock_cluster() + return spec + + +def 
_make_mock_config(cluster_type='Kubernetes'): + cfg = mock.MagicMock() + cfg.container_cluster.type = cluster_type + return cfg + + +class ScenarioNameTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for _SCENARIO_A_NAME, _SCENARIO_B_NAME, _SCENARIO_C_NAME.""" + + def testScenarioANameZeroPadsToThreeDigits(self): + self.assertEqual( + 'pkbma000', + kubernetes_management_benchmark._ScenarioAName(0), + ) + + def testScenarioANameTwoDigitIndex(self): + self.assertEqual( + 'pkbma042', + kubernetes_management_benchmark._ScenarioAName(42), + ) + + def testScenarioANameMaxThreeDigits(self): + self.assertEqual( + 'pkbma999', + kubernetes_management_benchmark._ScenarioAName(999), + ) + + def testScenarioBNameIsConstant(self): + self.assertEqual( + 'pkbmb', + kubernetes_management_benchmark._SCENARIO_B_NAME, + ) + + def testScenarioCNameZeroPadsToFourDigits(self): + self.assertEqual( + 'pkbmc0000', + kubernetes_management_benchmark._ScenarioCName(0), + ) + + def testScenarioCNameSingleDigitIndex(self): + self.assertEqual( + 'pkbmc0007', + kubernetes_management_benchmark._ScenarioCName(7), + ) + + def testScenarioCNameFourDigitIndex(self): + self.assertEqual( + 'pkbmc1000', + kubernetes_management_benchmark._ScenarioCName(1000), + ) + + def testAllNamesWithinAksLimit(self): + for i in range(1000): + self.assertLessEqual( + len(kubernetes_management_benchmark._ScenarioAName(i)), 12 + ) + for i in range(10000): + self.assertLessEqual( + len(kubernetes_management_benchmark._ScenarioCName(i)), 12 + ) + self.assertLessEqual( + len(kubernetes_management_benchmark._SCENARIO_B_NAME), 12 + ) + + +class CheckPrerequisitesTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the CheckPrerequisites validation function.""" + + def testValidScenariosPass(self): + with flagsaver.flagsaver(k8s_mgmt_scenarios=['A', 'B', 'C']): + kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config()) + + def testInvalidScenarioRaises(self): + with 
flagsaver.flagsaver(k8s_mgmt_scenarios=['X']): + with self.assertRaises(errors.Config.InvalidValue): + kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config()) + + def testMixedValidInvalidRaises(self): + with flagsaver.flagsaver(k8s_mgmt_scenarios=['A', 'Z']): + with self.assertRaises(errors.Config.InvalidValue): + kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config()) + + def testNonKubernetesClusterTypeRaises(self): + with flagsaver.flagsaver(k8s_mgmt_scenarios=['A']): + with self.assertRaises(errors.Config.InvalidValue): + kubernetes_management_benchmark.CheckPrerequisites( + _make_mock_config(cluster_type='Mesos') + ) + + def testInvalidScaleSweepRaises(self): + with flagsaver.flagsaver( + k8s_mgmt_scenarios=['C'], k8s_mgmt_scale_sweep=['10', 'abc'] + ): + with self.assertRaises(errors.Config.InvalidValue): + kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config()) + + def testValidScaleSweepPasses(self): + with flagsaver.flagsaver( + k8s_mgmt_scenarios=['C'], k8s_mgmt_scale_sweep=['10', '50', '100'] + ): + kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config()) + + def testLowercaseScenarioRaises(self): + with flagsaver.flagsaver(k8s_mgmt_scenarios=['a']): + with self.assertRaises(errors.Config.InvalidValue): + kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config()) + + +class PrepareTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the Prepare benchmark lifecycle function.""" + + def _patch_kubectl(self, rc=0): + return mock.patch( + 'perfkitbenchmarker.resources.container_service.kubectl' + + '.RunKubectlCommand', + return_value=('', '', rc), + ) + + def testPrepareRunsKubectlSleepPod(self): + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with self._patch_kubectl() as mock_kubectl: + kubernetes_management_benchmark.Prepare(bm_spec) + mock_kubectl.assert_called_once() + args = mock_kubectl.call_args[0][0] + self.assertIn('run', args) + 
self.assertIn('pkb-mgmt-sleep', args) + self.assertIn('sleep', args) + + def testPrepareSetsAlwaysCallCleanup(self): + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with self._patch_kubectl(): + kubernetes_management_benchmark.Prepare(bm_spec) + self.assertTrue(bm_spec.always_call_cleanup) + + def testPrepareToleratesKubectlNonZeroReturn(self): + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with self._patch_kubectl(rc=1): + kubernetes_management_benchmark.Prepare(bm_spec) + + +class CleanupTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the Cleanup benchmark lifecycle function.""" + + def _patch_kubectl(self): + return mock.patch( + 'perfkitbenchmarker.resources.container_service.kubectl' + + '.RunKubectlCommand', + return_value=('', '', 0), + ) + + def testCleanupDeletesSleepPod(self): + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with self._patch_kubectl() as mock_kubectl: + kubernetes_management_benchmark.Cleanup(bm_spec) + delete_calls = [ + str(c) for c in mock_kubectl.call_args_list + if 'pkb-mgmt-sleep' in str(c) + ] + self.assertNotEmpty(delete_calls) + + def testCleanupDeletesAllPkbmPrefixedPools(self): + cluster = _make_mock_cluster( + pool_names=['pkbma000', 'default-pool', 'pkbmc0001'] + ) + bm_spec = _make_mock_benchmark_spec(cluster) + with self._patch_kubectl(): + kubernetes_management_benchmark.Cleanup(bm_spec) + deleted = {c.args[0] for c in cluster.DeleteNodePool.call_args_list} + self.assertIn('pkbma000', deleted) + self.assertIn('pkbmc0001', deleted) + self.assertNotIn('default-pool', deleted) + + def testCleanupSkipsDeleteWhenNoLeftoverPools(self): + cluster = _make_mock_cluster(pool_names=['default-pool']) + bm_spec = _make_mock_benchmark_spec(cluster) + with self._patch_kubectl(): + kubernetes_management_benchmark.Cleanup(bm_spec) + cluster.DeleteNodePool.assert_not_called() + + def testCleanupHandlesNoneCluster(self): + 
bm_spec = _make_mock_benchmark_spec() + bm_spec.container_cluster = None + kubernetes_management_benchmark.Cleanup(bm_spec) + + +class CleanStartSweepTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _CleanStartSweep helper function.""" + + def testDeletesStalePkbmPools(self): + cluster = _make_mock_cluster( + pool_names=['pkbma000', 'pkbmc0001', 'user-pool'] + ) + kubernetes_management_benchmark._CleanStartSweep(cluster) + deleted = {c.args[0] for c in cluster.DeleteNodePool.call_args_list} + self.assertIn('pkbma000', deleted) + self.assertIn('pkbmc0001', deleted) + self.assertNotIn('user-pool', deleted) + + def testDoesNothingWhenNoPkbmPools(self): + cluster = _make_mock_cluster(pool_names=['user-pool', 'default-pool']) + kubernetes_management_benchmark._CleanStartSweep(cluster) + cluster.DeleteNodePool.assert_not_called() + + def testToleratesGetNodePoolNamesException(self): + cluster = _make_mock_cluster() + cluster.GetNodePoolNames.side_effect = RuntimeError('API error') + kubernetes_management_benchmark._CleanStartSweep(cluster) + + +class ResultsTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _Results result-accumulator helper.""" + + def testAddSingleEntry(self): + r = kubernetes_management_benchmark._Results() + r.add('op1', 0.1, 1.0, None) + self.assertLen(r.entries, 1) + name, init, e2e, err = r.entries[0] + self.assertEqual('op1', name) + self.assertAlmostEqual(0.1, init, places=5) + self.assertAlmostEqual(1.0, e2e, places=5) + self.assertIsNone(err) + + def testAddMultipleEntries(self): + r = kubernetes_management_benchmark._Results() + r.add('op1', 0.1, 1.0, None) + r.add('op2', 0.2, 2.0, ValueError('fail')) + self.assertLen(r.entries, 2) + + def testAddIsThreadSafe(self): + """Tests that concurrent add() calls from multiple threads are safe.""" + r = kubernetes_management_benchmark._Results() + n = 100 + + def _add(i): + r.add(f'op{i}', float(i), float(i) * 2, None) + + threads = [threading.Thread(target=_add, 
args=(i,)) for i in range(n)] + for t in threads: + t.start() + for t in threads: + t.join() + self.assertLen(r.entries, n) + + def testAddPreservesError(self): + r = kubernetes_management_benchmark._Results() + exc = RuntimeError('test error') + r.add('failing-op', 0.5, 0.5, exc) + _, _, _, err = r.entries[0] + self.assertIs(exc, err) + + +class TimedAsyncTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _TimedAsync timing helper.""" + + def testSuccessfulKickoffAndWait(self): + kickoff = mock.Mock(return_value='op-handle') + wait_fn = mock.Mock(return_value=None) + init_lat, e2e_lat, err = kubernetes_management_benchmark._TimedAsync( + kickoff, wait_fn + ) + kickoff.assert_called_once() + wait_fn.assert_called_once_with('op-handle') + self.assertIsNone(err) + self.assertGreaterEqual(init_lat, 0.0) + self.assertGreaterEqual(e2e_lat, init_lat) + + def testKickoffFailureReturnsError(self): + exc = RuntimeError('kickoff failed') + kickoff = mock.Mock(side_effect=exc) + wait_fn = mock.Mock() + init_lat, e2e_lat, err = kubernetes_management_benchmark._TimedAsync( + kickoff, wait_fn + ) + self.assertIs(exc, err) + wait_fn.assert_not_called() + self.assertAlmostEqual(init_lat, e2e_lat, places=2) + + def testWaitFailureReturnsError(self): + exc = RuntimeError('wait failed') + kickoff = mock.Mock(return_value='op-handle') + wait_fn = mock.Mock(side_effect=exc) + _, e2e_lat, err = kubernetes_management_benchmark._TimedAsync( + kickoff, wait_fn + ) + self.assertIs(exc, err) + self.assertGreater(e2e_lat, 0.0) + + def testInitLatencyNotGreaterThanE2eLatency(self): + kickoff = mock.Mock(return_value='handle') + wait_fn = mock.Mock(side_effect=lambda _: time.sleep(0.01)) + init_lat, e2e_lat, err = kubernetes_management_benchmark._TimedAsync( + kickoff, wait_fn + ) + self.assertIsNone(err) + self.assertLessEqual(init_lat, e2e_lat) + + def testHandlePassedToWaitFn(self): + kickoff = mock.Mock(return_value='my-op-handle') + wait_fn = mock.Mock() + 
kubernetes_management_benchmark._TimedAsync(kickoff, wait_fn) + wait_fn.assert_called_once_with('my-op-handle') + + +class RunAsyncTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _RunAsync concurrent execution helper.""" + + def testEmptyItemsReturnsEmptyList(self): + results = kubernetes_management_benchmark._RunAsync( + kickoff=mock.Mock(), + wait_fn=mock.Mock(), + items=[], + get_name=str, + ) + self.assertEmpty(results) + + @flagsaver.flagsaver(k8s_mgmt_max_concurrent=50) + def testReturnsOneResultPerItem(self): + kickoff = mock.Mock(return_value='op-handle') + wait_fn = mock.Mock(return_value=None) + results = kubernetes_management_benchmark._RunAsync( + kickoff=kickoff, wait_fn=wait_fn, items=['a', 'b', 'c'], get_name=str + ) + self.assertLen(results, 3) + self.assertEqual({'a', 'b', 'c'}, {name for name, _, _, _ in results}) + + @flagsaver.flagsaver(k8s_mgmt_max_concurrent=50) + def testKickoffErrorCapturedInResults(self): + kickoff = mock.Mock(side_effect=RuntimeError('kaboom')) + results = kubernetes_management_benchmark._RunAsync( + kickoff=kickoff, wait_fn=mock.Mock(), items=['x'], get_name=str + ) + self.assertLen(results, 1) + _, _, _, err = results[0] + self.assertIsNotNone(err) + + @flagsaver.flagsaver(k8s_mgmt_max_concurrent=2) + def testConcurrencyCapDoesNotDropItems(self): + results = kubernetes_management_benchmark._RunAsync( + kickoff=mock.Mock(return_value='op'), + wait_fn=mock.Mock(return_value=None), + items=list(range(5)), + get_name=str, + ) + self.assertLen(results, 5) + + @flagsaver.flagsaver(k8s_mgmt_max_concurrent=50) + def testGetNameCallableApplied(self): + cfg = mock.MagicMock() + cfg.name = 'poolname' + results = kubernetes_management_benchmark._RunAsync( + kickoff=mock.Mock(return_value='h'), + wait_fn=mock.Mock(), + items=[cfg], + get_name=lambda c: c.name, + ) + name, _, _, _ = results[0] + self.assertEqual('poolname', name) + + +class MakeNodePoolConfigTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for 
the _MakeNodePoolConfig factory.""" + + @flagsaver.flagsaver(k8s_mgmt_nodes_per_nodepool=3) + def testNameIsSet(self): + cluster = _make_mock_cluster() + cfg = kubernetes_management_benchmark._MakeNodePoolConfig(cluster, 'mypool') + self.assertEqual('mypool', cfg.name) + + @flagsaver.flagsaver(k8s_mgmt_nodes_per_nodepool=3) + def testNumNodesComesFromFlag(self): + cluster = _make_mock_cluster() + cfg = kubernetes_management_benchmark._MakeNodePoolConfig(cluster, 'p') + self.assertEqual(3, cfg.num_nodes) + self.assertEqual(3, cfg.min_nodes) + self.assertEqual(3, cfg.max_nodes) + + @flagsaver.flagsaver(k8s_mgmt_nodes_per_nodepool=1) + def testDoesNotMutateDefaultNodepool(self): + cluster = _make_mock_cluster() + original_name = cluster.default_nodepool.name + kubernetes_management_benchmark._MakeNodePoolConfig(cluster, 'newname') + self.assertEqual(original_name, cluster.default_nodepool.name) + + +class OpSamplesTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _OpSamples sample-generation helper.""" + + def testEmptyResultsYieldsSuccessRateOfZero(self): + samples = kubernetes_management_benchmark._OpSamples( + 'PrefixOp', [], attempted_ops=5 + ) + rate = next(s for s in samples if s.metric == 'PrefixOp_SuccessRate') + self.assertEqual(0.0, rate.value) + + def testPerOpInitiationAndE2eSamplesGenerated(self): + results = [('op1', 0.1, 1.0, None), ('op2', 0.2, 2.0, None)] + samples = kubernetes_management_benchmark._OpSamples( + 'MyOp', results, attempted_ops=2 + ) + metrics = [s.metric for s in samples] + self.assertIn('MyOp_InitiationLatency', metrics) + self.assertIn('MyOp_EndToEndLatency', metrics) + + def testSuccessRateHundredPercentWhenAllSucceed(self): + results = [('op1', 1.0, 2.0, None), ('op2', 0.5, 1.5, None)] + samples = kubernetes_management_benchmark._OpSamples( + 'Op', results, attempted_ops=2 + ) + rate = next(s for s in samples if s.metric == 'Op_SuccessRate') + self.assertAlmostEqual(100.0, rate.value) + + def 
testSuccessRateFiftyPercentWhenHalfFail(self): + results = [ + ('op1', 1.0, 2.0, None), + ('op2', 0.5, 0.5, RuntimeError('fail')), + ] + samples = kubernetes_management_benchmark._OpSamples( + 'Op', results, attempted_ops=2 + ) + rate = next(s for s in samples if s.metric == 'Op_SuccessRate') + self.assertAlmostEqual(50.0, rate.value) + + def testAttemptedOpsExceedingExecutedOpsLowersRate(self): + results = [('op1', 1.0, 2.0, None)] + samples = kubernetes_management_benchmark._OpSamples( + 'Op', results, attempted_ops=3 + ) + rate = next(s for s in samples if s.metric == 'Op_SuccessRate') + self.assertAlmostEqual(100.0 / 3, rate.value, places=3) + + def testSuccessRateMetadataFields(self): + results = [('op1', 1.0, 2.0, None), ('op2', 0.5, 0.5, Exception('err'))] + samples = kubernetes_management_benchmark._OpSamples( + 'Op', results, attempted_ops=3 + ) + rate = next(s for s in samples if s.metric == 'Op_SuccessRate') + self.assertEqual('3', rate.metadata['total_ops']) + self.assertEqual('2', rate.metadata['executed_ops']) + self.assertEqual('1', rate.metadata['successful_ops']) + self.assertEqual('1', rate.metadata['skipped_ops']) + + def testFailedOpIncludesErrorMessage(self): + results = [('fail-op', 0.5, 0.5, RuntimeError('oops'))] + samples = kubernetes_management_benchmark._OpSamples( + 'Op', results, attempted_ops=1 + ) + init_s = next(s for s in samples if s.metric == 'Op_InitiationLatency') + self.assertIn('error', init_s.metadata) + self.assertIn('oops', init_s.metadata['error']) + + def testAggregatesGeneratedForTwoOrMoreSuccesses(self): + results = [(f'op{i}', float(i), float(i) * 2, None) for i in range(1, 4)] + samples = kubernetes_management_benchmark._OpSamples( + 'Op', results, attempted_ops=3 + ) + metrics = [s.metric for s in samples] + self.assertIn('Op_InitiationLatency_Mean', metrics) + self.assertIn('Op_EndToEndLatency_Mean', metrics) + + def testAggregatesNotGeneratedForSingleSuccess(self): + results = [('op1', 1.0, 2.0, None)] + samples = 
kubernetes_management_benchmark._OpSamples( + 'Op', results, attempted_ops=1 + ) + self.assertNotIn('Op_InitiationLatency_Mean', + [s.metric for s in samples]) + + def testOutliersGeneratedForFourOrMoreSuccesses(self): + results = [(f'op{i}', float(i), float(i) * 2, None) for i in range(1, 6)] + samples = kubernetes_management_benchmark._OpSamples( + 'Op', results, attempted_ops=5 + ) + metrics = [s.metric for s in samples] + self.assertIn('Op_InitiationLatency_OutlierCount', metrics) + self.assertIn('Op_EndToEndLatency_OutlierCount', metrics) + + def testOutliersNotGeneratedForThreeOrFewerSuccesses(self): + results = [(f'op{i}', float(i), float(i) * 2, None) for i in range(1, 4)] + samples = kubernetes_management_benchmark._OpSamples( + 'Op', results, attempted_ops=3 + ) + self.assertNotIn('Op_InitiationLatency_OutlierCount', + [s.metric for s in samples]) + + +class AggregateSamplesTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _AggregateSamples statistics helper.""" + + def testProducesAllExpectedStatMetrics(self): + samples = kubernetes_management_benchmark._AggregateSamples( + 'Pfx', 'InitiationLatency', [1.0, 2.0, 3.0, 4.0, 5.0] + ) + metrics = {s.metric for s in samples} + for label in ('Mean', 'StdDev', 'Min', 'Median', 'P90', 'P99', 'Max'): + self.assertIn(f'Pfx_InitiationLatency_{label}', metrics) + + def testMeanValueCorrect(self): + samples = kubernetes_management_benchmark._AggregateSamples( + 'Op', 'E2E', [1.0, 2.0, 3.0, 4.0, 5.0] + ) + mean_s = next(s for s in samples if 'Mean' in s.metric) + self.assertAlmostEqual(3.0, mean_s.value, places=3) + + def testMinValueCorrect(self): + samples = kubernetes_management_benchmark._AggregateSamples( + 'Op', 'E2E', [10.0, 20.0, 30.0] + ) + min_s = next(s for s in samples if 'Min' in s.metric) + self.assertAlmostEqual(10.0, min_s.value, places=3) + + def testMaxValueCorrect(self): + samples = kubernetes_management_benchmark._AggregateSamples( + 'Op', 'E2E', [10.0, 20.0, 30.0] + ) + max_s = 
next(s for s in samples if 'Max' in s.metric) + self.assertAlmostEqual(30.0, max_s.value, places=3) + + def testSampleCountInMetadata(self): + samples = kubernetes_management_benchmark._AggregateSamples( + 'Op', 'E2E', [1.0, 2.0, 3.0] + ) + for s in samples: + self.assertEqual('3', s.metadata.get('sample_count')) + + def testUnitsAreSeconds(self): + samples = kubernetes_management_benchmark._AggregateSamples( + 'Op', 'E2E', [1.0, 2.0] + ) + for s in samples: + self.assertEqual('seconds', s.unit) + + +class OutlierSamplesTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _OutlierSamples IQR-based outlier detection helper.""" + + def testNoOutliersYieldsZeroCount(self): + samples = kubernetes_management_benchmark._OutlierSamples( + 'Op', 'E2E', [1.0, 1.1, 1.2, 1.3, 1.4, 1.5] + ) + self.assertLen(samples, 1) + self.assertEqual(0, samples[0].value) + + def testClearOutlierDetected(self): + samples = kubernetes_management_benchmark._OutlierSamples( + 'Op', 'E2E', [1.0, 1.0, 1.0, 1.0, 100.0] + ) + self.assertEqual(1, samples[0].value) + + def testMetricNameFormatted(self): + samples = kubernetes_management_benchmark._OutlierSamples( + 'MyPrefix', 'InitiationLatency', [1.0, 2.0, 3.0, 4.0] + ) + self.assertEqual( + 'MyPrefix_InitiationLatency_OutlierCount', samples[0].metric + ) + + def testMetadataContainsFenceFields(self): + meta = kubernetes_management_benchmark._OutlierSamples( + 'Op', 'E2E', [1.0, 2.0, 3.0, 4.0, 5.0] + )[0].metadata + for field in ('q1', 'q3', 'iqr', 'upper_fence', 'lower_fence', + 'sample_count'): + self.assertIn(field, meta) + + def testSampleCountInMetadata(self): + samples = kubernetes_management_benchmark._OutlierSamples( + 'Op', 'E2E', [1.0, 2.0, 3.0, 4.0, 5.0] + ) + self.assertEqual('5', samples[0].metadata['sample_count']) + + def testUnitIsCount(self): + samples = kubernetes_management_benchmark._OutlierSamples( + 'Op', 'E2E', [1.0, 2.0, 3.0, 4.0] + ) + self.assertEqual('count', samples[0].unit) + + def 
testReturnsSingleSample(self): + samples = kubernetes_management_benchmark._OutlierSamples( + 'Op', 'E2E', list(range(1, 11)) + ) + self.assertLen(samples, 1) + + +class RunTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the Run benchmark entry-point function.""" + + @flagsaver.flagsaver( + k8s_mgmt_scenarios=['A', 'B', 'C'], + k8s_mgmt_scale_sweep=[], + k8s_mgmt_large_scale_nodepools=10, + ) + def testRunCallsCleanStartSweep(self): + """Tests that Run invokes _CleanStartSweep before executing scenarios.""" + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with mock.patch.object( + kubernetes_management_benchmark, '_CleanStartSweep' + ) as mock_clean, mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioA', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioB', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioC', return_value=[] + ): + kubernetes_management_benchmark.Run(bm_spec) + mock_clean.assert_called_once_with(cluster) + + @flagsaver.flagsaver( + k8s_mgmt_scenarios=['A'], + k8s_mgmt_scale_sweep=[], + k8s_mgmt_large_scale_nodepools=10, + ) + def testRunOnlyScenarioACallsOnlyA(self): + """Tests that Run only calls _RunScenarioA when scenarios=['A'].""" + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with mock.patch.object( + kubernetes_management_benchmark, '_CleanStartSweep' + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioA', return_value=[] + ) as mock_a, mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioB', return_value=[] + ) as mock_b, mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioC', return_value=[] + ) as mock_c: + kubernetes_management_benchmark.Run(bm_spec) + mock_a.assert_called_once() + mock_b.assert_not_called() + mock_c.assert_not_called() + + @flagsaver.flagsaver( + k8s_mgmt_scenarios=['B'], + k8s_mgmt_scale_sweep=[], + 
k8s_mgmt_large_scale_nodepools=10, + ) + def testRunOnlyScenarioBCallsOnlyB(self): + """Tests that Run only calls _RunScenarioB when scenarios=['B'].""" + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with mock.patch.object( + kubernetes_management_benchmark, '_CleanStartSweep' + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioA', return_value=[] + ) as mock_a, mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioB', return_value=[] + ) as mock_b, mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioC', return_value=[] + ) as mock_c: + kubernetes_management_benchmark.Run(bm_spec) + mock_a.assert_not_called() + mock_b.assert_called_once() + mock_c.assert_not_called() + + @flagsaver.flagsaver( + k8s_mgmt_scenarios=['C'], + k8s_mgmt_scale_sweep=[], + k8s_mgmt_large_scale_nodepools=42, + ) + def testRunScenarioCPassesLargeScaleFlag(self): + """Tests that Run passes the large-scale-nodepools flag to _RunScenarioC.""" + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with mock.patch.object( + kubernetes_management_benchmark, '_CleanStartSweep' + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioA', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioB', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioC', return_value=[] + ) as mock_c: + kubernetes_management_benchmark.Run(bm_spec) + mock_c.assert_called_once() + _, _, scale = mock_c.call_args.args + self.assertEqual(42, scale) + + @flagsaver.flagsaver( + k8s_mgmt_scenarios=['C'], + k8s_mgmt_scale_sweep=['10', '50'], + k8s_mgmt_large_scale_nodepools=100, + ) + def testRunScenarioCScaleSweepRunsTwice(self): + """Tests that Run calls _RunScenarioC once per scale in the sweep.""" + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with mock.patch.object( + 
kubernetes_management_benchmark, '_CleanStartSweep' + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioA', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioB', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, + '_RunScenarioC', + return_value=[_make_sample('m', 1.0)], + ) as mock_c: + kubernetes_management_benchmark.Run(bm_spec) + self.assertEqual(2, mock_c.call_count) + scales = [call.args[2] for call in mock_c.call_args_list] + self.assertIn(10, scales) + self.assertIn(50, scales) + + @flagsaver.flagsaver( + k8s_mgmt_scenarios=['C'], + k8s_mgmt_scale_sweep=['10'], + k8s_mgmt_large_scale_nodepools=10, + ) + def testRunTagsScenarioCScaleInMetadata(self): + """Tests that Run adds scenario_c_scale to each sample's metadata.""" + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + test_sample = _make_sample('metric', 1.0) + with mock.patch.object( + kubernetes_management_benchmark, '_CleanStartSweep' + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioA', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioB', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, + '_RunScenarioC', + return_value=[test_sample], + ): + samples = kubernetes_management_benchmark.Run(bm_spec) + self.assertIn('scenario_c_scale', samples[0].metadata) + self.assertEqual('10', samples[0].metadata['scenario_c_scale']) + + @flagsaver.flagsaver( + k8s_mgmt_scenarios=['A'], + k8s_mgmt_scale_sweep=[], + k8s_mgmt_large_scale_nodepools=10, + ) + def testRunTagsAllSamplesWithRunMetadata(self): + """Tests that Run adds version and config keys to all sample metadata.""" + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + test_sample = _make_sample('m', 1.0) + with mock.patch.object( + kubernetes_management_benchmark, '_CleanStartSweep' + ), mock.patch.object( + 
kubernetes_management_benchmark, + '_RunScenarioA', + return_value=[test_sample], + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioB', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioC', return_value=[] + ): + samples = kubernetes_management_benchmark.Run(bm_spec) + meta = samples[0].metadata + for key in ('initial_version', 'target_version', 'cluster_k8s_version', + 'nodes_per_nodepool', 'concurrent_nodepools'): + self.assertIn(key, meta) + + @flagsaver.flagsaver( + k8s_mgmt_scenarios=['A'], + k8s_mgmt_initial_version='1.30', + k8s_mgmt_target_version='1.31', + k8s_mgmt_scale_sweep=[], + k8s_mgmt_large_scale_nodepools=10, + ) + def testRunUsesExplicitVersionFlags(self): + """Tests that Run uses explicit version flags over auto-resolved ones.""" + cluster = _make_mock_cluster() + bm_spec = _make_mock_benchmark_spec(cluster) + with mock.patch.object( + kubernetes_management_benchmark, '_CleanStartSweep' + ), mock.patch.object( + kubernetes_management_benchmark, + '_RunScenarioA', + return_value=[_make_sample('m', 1.0)], + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioB', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioC', return_value=[] + ): + samples = kubernetes_management_benchmark.Run(bm_spec) + cluster.ResolveNodePoolVersions.assert_not_called() + self.assertEqual('1.30', samples[0].metadata['initial_version']) + self.assertEqual('1.31', samples[0].metadata['target_version']) + + @flagsaver.flagsaver( + k8s_mgmt_scenarios=['A'], + k8s_mgmt_scale_sweep=[], + k8s_mgmt_large_scale_nodepools=10, + ) + def testRunAutoResolvesVersionsWhenFlagsAbsent(self): + """Tests Run calls ResolveNodePoolVersions when version flags absent.""" + cluster = _make_mock_cluster() + cluster.ResolveNodePoolVersions.return_value = ('1.33', '1.34') + bm_spec = _make_mock_benchmark_spec(cluster) + with mock.patch.object( + kubernetes_management_benchmark, 
'_CleanStartSweep' + ), mock.patch.object( + kubernetes_management_benchmark, + '_RunScenarioA', + return_value=[_make_sample('m', 1.0)], + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioB', return_value=[] + ), mock.patch.object( + kubernetes_management_benchmark, '_RunScenarioC', return_value=[] + ): + samples = kubernetes_management_benchmark.Run(bm_spec) + cluster.ResolveNodePoolVersions.assert_called_once() + self.assertEqual('1.33', samples[0].metadata['initial_version']) + self.assertEqual('1.34', samples[0].metadata['target_version']) + + +class RunScenarioATest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _RunScenarioA phase-by-phase and pipelined modes.""" + + @flagsaver.flagsaver( + k8s_mgmt_concurrent_nodepools=2, + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + k8s_mgmt_pipeline_scenario_a=False, + ) + def testPhaseByPhaseProducesCreateUpgradeDeleteSamples(self): + """Tests Scenario A produces Create, Upgrade, and Delete samples.""" + cluster = _make_mock_cluster(pool_names=['pkbma000', 'pkbma001']) + samples = kubernetes_management_benchmark._RunScenarioA( + cluster, '1.33', '1.34' + ) + metrics = {s.metric for s in samples} + self.assertTrue(any('ScenarioA_Create' in m for m in metrics)) + self.assertTrue(any('ScenarioA_Upgrade' in m for m in metrics)) + self.assertTrue(any('ScenarioA_Delete' in m for m in metrics)) + + @flagsaver.flagsaver( + k8s_mgmt_concurrent_nodepools=2, + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + k8s_mgmt_pipeline_scenario_a=False, + ) + def testPhaseByPhasePassesInitialVersionToCreate(self): + """Tests _RunScenarioA passes initial_version to CreateNodePoolAsync.""" + cluster = _make_mock_cluster(pool_names=['pkbma000', 'pkbma001']) + kubernetes_management_benchmark._RunScenarioA(cluster, '1.33', '1.34') + for call in cluster.CreateNodePoolAsync.call_args_list: + kw = call.kwargs if call.kwargs else {} + pos = call.args + node_version = ( + 
kw.get('node_version') or (pos[1] if len(pos) > 1 else None) + ) + self.assertEqual('1.33', node_version) + + @flagsaver.flagsaver( + k8s_mgmt_concurrent_nodepools=2, + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + k8s_mgmt_pipeline_scenario_a=False, + ) + def testPhaseByPhaseDeleteUsesLivePoolList(self): + """Tests that _RunScenarioA deletes only the pools it finds at runtime.""" + cluster = _make_mock_cluster(pool_names=['pkbma000']) + kubernetes_management_benchmark._RunScenarioA(cluster, '1.33', '1.34') + self.assertEqual(1, cluster.DeleteNodePoolAsync.call_count) + + @flagsaver.flagsaver( + k8s_mgmt_concurrent_nodepools=2, + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + k8s_mgmt_pipeline_scenario_a=True, + ) + def testPipelinedModeActivatedByFlag(self): + """Tests pipelined mode is activated by the pipeline_scenario_a flag.""" + cluster = _make_mock_cluster(pool_names=[]) + samples = kubernetes_management_benchmark._RunScenarioA( + cluster, '1.33', '1.34' + ) + metrics = {s.metric for s in samples} + self.assertTrue(any('ScenarioA_Create' in m for m in metrics)) + self.assertTrue(any('ScenarioA_Upgrade' in m for m in metrics)) + self.assertTrue(any('ScenarioA_Delete' in m for m in metrics)) + + +class RunScenarioAPipelinedTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _RunScenarioAPipelined pipelined execution path.""" + + @flagsaver.flagsaver( + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testPipelinedProducesAllThreePhases(self): + """Tests pipelined Scenario A produces Create/Upgrade/Delete samples.""" + cluster = _make_mock_cluster(pool_names=[]) + samples = kubernetes_management_benchmark._RunScenarioAPipelined( + cluster, n=2, initial='1.33', target='1.34' + ) + metrics = {s.metric for s in samples} + self.assertTrue(any('ScenarioA_Create' in m for m in metrics)) + self.assertTrue(any('ScenarioA_Upgrade' in m for m in metrics)) + self.assertTrue(any('ScenarioA_Delete' in m for 
m in metrics)) + + @flagsaver.flagsaver( + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testPipelinedSkipsUpgradeAfterCreateFailure(self): + """Tests pipelined mode skips upgrade when create fails.""" + cluster = _make_mock_cluster(pool_names=[]) + cluster.CreateNodePoolAsync.side_effect = RuntimeError('create failed') + samples = kubernetes_management_benchmark._RunScenarioAPipelined( + cluster, n=1, initial='1.33', target='1.34' + ) + cluster.UpgradeNodePoolAsync.assert_not_called() + upgrade_rate = next( + (s for s in samples if s.metric == 'ScenarioA_Upgrade_SuccessRate'), + None, + ) + if upgrade_rate is not None: + self.assertEqual(0.0, upgrade_rate.value) + + +class RunScenarioBTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _RunScenarioB cluster-update + nodepool-create scenario.""" + + @flagsaver.flagsaver( + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testProducesClusterUpdateAndNodePoolCreateSamples(self): + cluster = _make_mock_cluster(pool_names=[]) + samples = kubernetes_management_benchmark._RunScenarioB(cluster, '1.33') + metrics = {s.metric for s in samples} + self.assertTrue(any('ScenarioB_ClusterUpdate' in m for m in metrics)) + self.assertTrue(any('ScenarioB_NodePoolCreate' in m for m in metrics)) + + @flagsaver.flagsaver( + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testDeletesTestPoolAfterRun(self): + cluster = _make_mock_cluster(pool_names=[]) + kubernetes_management_benchmark._RunScenarioB(cluster, '1.33') + cluster.DeleteNodePool.assert_called_once_with( + kubernetes_management_benchmark._SCENARIO_B_NAME + ) + + @flagsaver.flagsaver( + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testToleratesDeleteFailure(self): + cluster = _make_mock_cluster(pool_names=[]) + cluster.DeleteNodePool.side_effect = RuntimeError('delete failed') + kubernetes_management_benchmark._RunScenarioB(cluster, '1.33') + + @flagsaver.flagsaver( + 
k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testPassesInitialVersionToCreate(self): + """Tests _RunScenarioB passes initial_version to CreateNodePoolAsync.""" + cluster = _make_mock_cluster(pool_names=[]) + kubernetes_management_benchmark._RunScenarioB(cluster, '1.33') + for call in cluster.CreateNodePoolAsync.call_args_list: + kw = call.kwargs if call.kwargs else {} + pos = call.args + node_version = ( + kw.get('node_version') or (pos[1] if len(pos) > 1 else None) + ) + self.assertEqual('1.33', node_version) + + +class RunScenarioCTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the _RunScenarioC large-scale create-and-delete scenario.""" + + @flagsaver.flagsaver( + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testProducesCreateAndDeleteSamples(self): + cluster = _make_mock_cluster(pool_names=['pkbmc0000', 'pkbmc0001']) + samples = kubernetes_management_benchmark._RunScenarioC( + cluster, '1.33', scale=2 + ) + metrics = {s.metric for s in samples} + self.assertTrue(any('ScenarioC_Create' in m for m in metrics)) + self.assertTrue(any('ScenarioC_Delete' in m for m in metrics)) + + @flagsaver.flagsaver( + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testZeroLivePoolsRecordsZeroDeleteSuccessRate(self): + """Tests Scenario C records 0% delete rate when no live pools exist.""" + cluster = _make_mock_cluster(pool_names=[]) + samples = kubernetes_management_benchmark._RunScenarioC( + cluster, '1.33', scale=3 + ) + delete_rate = next( + s for s in samples if s.metric == 'ScenarioC_Delete_SuccessRate' + ) + self.assertEqual(0.0, delete_rate.value) + cluster.DeleteNodePoolAsync.assert_not_called() + + @flagsaver.flagsaver( + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testDeleteUsesLiveListNotOriginalCreateList(self): + cluster = _make_mock_cluster(pool_names=['pkbmc0000', 'pkbmc0001']) + kubernetes_management_benchmark._RunScenarioC(cluster, '1.33', scale=3) 
+ self.assertEqual(2, cluster.DeleteNodePoolAsync.call_count) + + @flagsaver.flagsaver( + k8s_mgmt_nodes_per_nodepool=1, + k8s_mgmt_max_concurrent=50, + ) + def testCreateSuccessRateUsesScaleAsDenominator(self): + """Tests Scenario C create success rate uses scale as total_ops.""" + cluster = _make_mock_cluster(pool_names=['pkbmc0000']) + samples = kubernetes_management_benchmark._RunScenarioC( + cluster, '1.33', scale=3 + ) + create_rate = next( + s for s in samples if s.metric == 'ScenarioC_Create_SuccessRate' + ) + self.assertLessEqual(create_rate.value, 100.0) + self.assertEqual('3', create_rate.metadata['total_ops']) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/providers/aws/elastic_kubernetes_service_test.py b/tests/providers/aws/elastic_kubernetes_service_test.py index 90d28eb834..bc8744d565 100644 --- a/tests/providers/aws/elastic_kubernetes_service_test.py +++ b/tests/providers/aws/elastic_kubernetes_service_test.py @@ -1,12 +1,16 @@ +"""Tests for the AWS Elastic Kubernetes Service provider.""" +# pylint: disable=invalid-name,protected-access + import json import os import tempfile import unittest from unittest import mock from urllib import parse -from absl.testing import flagsaver -from absl.testing import parameterized +from absl.testing import flagsaver # pylint: disable=import-error +from absl.testing import parameterized # pylint: disable=import-error from perfkitbenchmarker import data +from perfkitbenchmarker import errors from perfkitbenchmarker import network from perfkitbenchmarker import vm_util from perfkitbenchmarker.configs import container_spec @@ -34,6 +38,7 @@ class BaseEksTest(pkb_common_test_case.PkbCommonTestCase): + """Base test class providing common EKS cluster setup and mock helpers.""" def setUp(self): super().setUp() @@ -80,11 +85,13 @@ def MockJsonRead(self, cluster: elastic_kubernetes_service.BaseEksCluster): class ElasticKubernetesServiceTest(BaseEksTest): + """Tests for the managed-nodegroup EksCluster 
provider.""" def testInitEksClusterWorks(self): elastic_kubernetes_service.EksCluster(EKS_SPEC) def testEksClusterCreateRegion(self): + """EksCluster._Create() without explicit AZ omits availabilityZones.""" self.MockIssueCommand({'create cluster': [('Cluster created', '', 0)]}) spec = container_spec.ContainerClusterSpec( 'NAME', @@ -121,6 +128,9 @@ def testEksClusterCreateRegion(self): ) def testEksClusterCreateZone(self): + """EksCluster._Create() with a zone issues the expected eksctl commands.""" + ebs_policy = 'arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy' + ebs_role = 'arn:aws:iam::1234:role/AmazonEKS_EBS_CSI_DriverRole_pkb-123p' issue_command = self.MockIssueCommand( {'create cluster': [('Cluster created', '', 0)]} ) @@ -136,7 +146,7 @@ def testEksClusterCreateZone(self): '--namespace=kube-system', '--region=us-west-1', '--cluster=pkb-123p', - '--attach-policy-arn=arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy', + f'--attach-policy-arn={ebs_policy}', '--approve', '--role-only', '--role-name=AmazonEKS_EBS_CSI_DriverRole_pkb-123p', @@ -148,7 +158,7 @@ def testEksClusterCreateZone(self): '--name=aws-ebs-csi-driver', '--region=us-west-1', '--cluster=pkb-123p', - '--service-account-role-arn=arn:aws:iam::1234:role/AmazonEKS_EBS_CSI_DriverRole_pkb-123p', + f'--service-account-role-arn={ebs_role}', ]), ]) assert self.patched_read_json is not None @@ -158,6 +168,7 @@ def testEksClusterCreateZone(self): ) def testEksClusterNodepools(self): + """Additional nodepools appear in the managedNodeGroups config.""" self.MockIssueCommand({'create cluster': [('Cluster created', '', 0)]}) spec2 = EKS_SPEC_DICT.copy() spec2['nodepools'] = { @@ -200,6 +211,7 @@ def testEksClusterNodepools(self): ) def testEksClusterNodepoolsAutoscaling(self): + """Autoscaling min/max/desired values propagate to managedNodeGroups.""" self.MockIssueCommand({'create cluster': [('Cluster created', '', 0)]}) spec2 = EKS_SPEC_DICT.copy() spec2['min_vm_count'] = 1 @@ 
-236,6 +248,7 @@ def testEksClusterNodepoolsAutoscaling(self): self.assertEqual(node_groups[1]['desiredCapacity'], 3) def testGetNodePoolNames(self): + """GetNodePoolNames returns list of nodegroup names from eksctl output.""" # Mock the output of the aws cli command cluster = elastic_kubernetes_service.EksCluster(EKS_SPEC) @@ -255,6 +268,7 @@ def testGetNodePoolNames(self): ) def testGetNodePoolNamesKarpenter(self): + """GetNodePoolNames on Karpenter cluster returns kubectl nodepool names.""" cluster = elastic_kubernetes_service.EksKarpenterCluster(EKS_SPEC) self.MockIssueCommand({ 'kubectl --kubeconfig get nodepool -o json': [( @@ -275,6 +289,7 @@ def testGetNodePoolNamesKarpenter(self): ('standard nodepool', 'nginx', 'nginx'), ) def testEksClusterGetNodepoolFromName(self, nodepool_name, expected_name): + """GetNodePoolFromNodeName resolves a node name to its nodepool.""" self.MockIssueCommand({'get node': [(nodepool_name, '', 0)]}) spec2 = EKS_SPEC_DICT.copy() spec2['nodepools'] = { @@ -296,6 +311,7 @@ def testEksClusterGetNodepoolFromName(self, nodepool_name, expected_name): self.assertEqual(nodepool.name, expected_name) def testEksClusterNotFound(self): + """GetNodePoolFromNodeName returns None when node is not found.""" self.MockIssueCommand({'get node': [('', '', 0)]}) spec2 = EKS_SPEC_DICT.copy() spec2['nodepools'] = { @@ -326,6 +342,7 @@ def testEksClusterGetMachineTypeFromNodeName(self): class EksAutoClusterTest(BaseEksTest): + """Tests for the auto-mode EksAutoCluster provider.""" def testInitEksClusterWorks(self): elastic_kubernetes_service.EksAutoCluster(EKS_SPEC) @@ -340,6 +357,7 @@ def testEksClusterCreate(self): self.assertEqual(called_json['autoModeConfig'], {'enabled': True}) def testEksClusterIsReady(self): + """EksAutoCluster._IsReady() returns True when cluster-info succeeds.""" self.enter_context( mock.patch.object( kubectl, @@ -347,7 +365,8 @@ def testEksClusterIsReady(self): return_value=( ( r'^[[0;32mKubernetes control plane^[[0m is running 
at' - r' ^[[0;33mhttps://RAND1234.gr7.us-west-1.eks.amazonaws.com^[[0mTo' + r' ^[[0;33mhttps://RAND1234.gr7.us-west-1.' + r'eks.amazonaws.com^[[0mTo' " further debug and diagnose cluster problems, use 'kubectl" " cluster-info dump'." ), @@ -361,6 +380,7 @@ def testEksClusterIsReady(self): class EksKarpenterTest(BaseEksTest): + """Tests for the Karpenter-based EksKarpenterCluster provider.""" def setUp(self): super().setUp() @@ -380,6 +400,7 @@ def testInitEksClusterWorks(self): @flagsaver.flagsaver(kubeconfig='/tmp/kubeconfig') def testEksYamlCreateFull(self): + """EksKarpenterCluster._Create() produces the expected eksctl yaml.""" cluster = elastic_kubernetes_service.EksKarpenterCluster(EKS_SPEC) self.MockJsonRead(cluster) mock_cmd = self.MockIssueCommand({ @@ -452,6 +473,7 @@ def testEksYamlCreateFull(self): ) @flagsaver.flagsaver(kubeconfig='/tmp/kubeconfig') def testEksYamlCreateFullNodepools(self, nodepool_config, expected_content): + """EksKarpenterCluster._PostCreate() logs expected nodepool yaml.""" # Mock resources for _PostCreate self.MockIssueCommand({ 'helm upgrade --install karpenter': [('', '', 0)], @@ -515,28 +537,17 @@ def testRecursiveDictionaryUpdate(self): expected = {'a': 3, 'deep': {'c': 2, 'd': 4}, 'f': 12} self.assertEqual( expected, - elastic_kubernetes_service.RecursivelyUpdateDictionary(base, update), + elastic_kubernetes_service._recursively_update_dictionary(base, update), ) def testIngressAddressParsing(self): """Test parsing AWS ALB address with dualstack prefix removal.""" + elb_host = 'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com' test_cases = [ - ( - 'http://dualstack.k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com', - 'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com', - ), - ( - 'https://dualstack.k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com', - 'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com', - ), - ( - 
'dualstack.k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com', - 'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com', - ), - ( - 'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com', - 'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com', - ), + (f'http://dualstack.{elb_host}', elb_host), + (f'https://dualstack.{elb_host}', elb_host), + (f'dualstack.{elb_host}', elb_host), + (elb_host, elb_host), ] for address, expected in test_cases: with self.subTest(address=address): @@ -549,5 +560,289 @@ def testIngressAddressParsing(self): self.assertEqual(normalized, expected) +class EksManagementPlaneTest(BaseEksTest): + """Tests for EKS management-plane methods (k8s_management_benchmark).""" + + def _make_cluster(self, spec_dict=None): + spec = container_spec.ContainerClusterSpec( + 'NAME', + **(spec_dict or EKS_SPEC_DICT), + ) + cluster = elastic_kubernetes_service.EksCluster(spec) + self.MockJsonRead(cluster) + # Individual tests override via MockIssueCommand. 
+ return cluster + + def _make_nodepool_config(self, name='pkbpool0', machine_type='m5.large', + num_nodes=2): + cfg = mock.MagicMock() + cfg.name = name + cfg.num_nodes = num_nodes + cfg.machine_type = machine_type + return cfg + + # ---- CreateNodePoolAsync -------------------------------------------------- + + def testCreateNodePoolAsyncIssuesCreateNodegroup(self): + """CreateNodePoolAsync calls create-nodegroup; returns ng_active handle.""" + cluster = self._make_cluster() + # Subnets / AZ discovery stubs + cluster._cached_subnets = ['subnet-1'] + cluster._cached_subnets_per_az = {} + cluster._cached_node_role_arn = 'arn:aws:iam::1234:role/NodeRole' + self.MockIssueCommand({'create-nodegroup': [('', '', 0)]}) + + handle = cluster.CreateNodePoolAsync(self._make_nodepool_config('poolA')) + + self.assertEqual('ng_active:poolA', handle) + # Verify the json file path was written + self.assertIsNotNone(self.patched_read_json) + + def testCreateNodePoolAsyncReturnsNgActiveHandle(self): + """CreateNodePoolAsync returns 'ng_active:' on success.""" + cluster = self._make_cluster() + cluster._cached_subnets = ['subnet-1'] + cluster._cached_subnets_per_az = {} + cluster._cached_node_role_arn = 'arn:aws:iam::1234:role/NodeRole' + self.MockIssueCommand({'': [('', '', 0)]}) + + handle = cluster.CreateNodePoolAsync(self._make_nodepool_config('myng')) + self.assertEqual('ng_active:myng', handle) + + def testCreateNodePoolAsyncRaisesOnFailure(self): + """CreateNodePoolAsync raises CreationError when the CLI fails.""" + cluster = self._make_cluster() + cluster._cached_subnets = ['subnet-1'] + cluster._cached_subnets_per_az = {} + cluster._cached_node_role_arn = 'arn:aws:iam::1234:role/NodeRole' + self.MockIssueCommand({'': [('', 'error msg', 1)]}) + + with self.assertRaises(Exception): + cluster.CreateNodePoolAsync(self._make_nodepool_config('failng')) + + # ---- UpgradeNodePoolAsync ------------------------------------------------- + + def 
testUpgradeNodePoolAsyncReturnsNgActiveHandle(self): + """UpgradeNodePoolAsync calls update-nodegroup-version; returns handle.""" + cluster = self._make_cluster() + mock_cmd = self.MockIssueCommand( + {'update-nodegroup-version': [('', '', 0)]} + ) + handle = cluster.UpgradeNodePoolAsync('my-ng', '1.34') + + self.assertEqual('ng_active:my-ng', handle) + self.assertIn('update-nodegroup-version', mock_cmd.all_commands) + self.assertIn('--kubernetes-version 1.34', mock_cmd.all_commands) + + def testUpgradeNodePoolAsyncRaisesOnFailure(self): + """UpgradeNodePoolAsync raises on non-zero exit code.""" + cluster = self._make_cluster() + self.MockIssueCommand({'': [('', 'oops', 1)]}) + with self.assertRaises(Exception): + cluster.UpgradeNodePoolAsync('bad-ng', '1.34') + + # ---- DeleteNodePoolAsync -------------------------------------------------- + + def testDeleteNodePoolAsyncReturnsNgGoneHandle(self): + """DeleteNodePoolAsync calls delete-nodegroup, returns ng_gone handle.""" + cluster = self._make_cluster() + mock_cmd = self.MockIssueCommand({'delete-nodegroup': [('', '', 0)]}) + handle = cluster.DeleteNodePoolAsync('old-ng') + + self.assertEqual('ng_gone:old-ng', handle) + self.assertIn('delete-nodegroup', mock_cmd.all_commands) + self.assertIn('--nodegroup-name old-ng', mock_cmd.all_commands) + + # ---- UpdateClusterAsync --------------------------------------------------- + + def testUpdateClusterAsyncReturnsClusterUpdateHandle(self): + """UpdateClusterAsync returns 'cluster_update:'.""" + cluster = self._make_cluster() + describe_out = json.dumps({ + 'cluster': {'logging': {'clusterLogging': []}} + }) + update_out = json.dumps({'update': {'id': 'u-abc123'}}) + self.MockIssueCommand({ + 'describe-cluster': [(describe_out, '', 0)], + 'update-cluster-config': [(update_out, '', 0)], + }) + handle = cluster.UpdateClusterAsync() + self.assertEqual('cluster_update:u-abc123', handle) + + def testUpdateClusterAsyncTogglesLogging(self): + """UpdateClusterAsync toggles 
logging enable state.""" + cluster = self._make_cluster() + # Current state: logging disabled + describe_out = json.dumps({ + 'cluster': {'logging': {'clusterLogging': [{'enabled': False}]}} + }) + update_out = json.dumps({'update': {'id': 'u-xyz'}}) + mock_cmd = self.MockIssueCommand({ + 'describe-cluster': [(describe_out, '', 0)], + 'update-cluster-config': [(update_out, '', 0)], + }) + cluster.UpdateClusterAsync() + self.assertIn('update-cluster-config', mock_cmd.all_commands) + self.assertIn('--logging', mock_cmd.all_commands) + + # ---- WaitForOperation ----------------------------------------------------- + + def testWaitForOperationNgActiveSuccess(self): + """WaitForOperation(ng_active:name) returns when nodegroup is ACTIVE.""" + cluster = self._make_cluster() + ng_out = json.dumps({'nodegroup': {'status': 'ACTIVE'}}) + self.MockIssueCommand({'describe-nodegroup': [(ng_out, '', 0)]}) + # Should not raise + cluster.WaitForOperation('ng_active:my-ng') + + def testWaitForOperationNgActiveFailedRaises(self): + """WaitForOperation raises CreationError on CREATE_FAILED nodegroup.""" + cluster = self._make_cluster() + ng_out = json.dumps({'nodegroup': {'status': 'CREATE_FAILED'}}) + self.MockIssueCommand({'describe-nodegroup': [(ng_out, '', 0)]}) + with self.assertRaises(Exception): + cluster.WaitForOperation('ng_active:bad-ng') + + def testWaitForOperationNgGoneSuccess(self): + """WaitForOperation(ng_gone:name) returns on ResourceNotFoundException.""" + cluster = self._make_cluster() + self.MockIssueCommand({ + 'describe-nodegroup': [('', 'ResourceNotFoundException', 1)] + }) + # Should not raise + cluster.WaitForOperation('ng_gone:deleted-ng') + + def testWaitForOperationClusterUpdateSuccess(self): + """WaitForOperation(cluster_update:id) returns when update is Successful.""" + cluster = self._make_cluster() + self.MockIssueCommand({'describe-update': [('Successful\n', '', 0)]}) + # Should not raise + cluster.WaitForOperation('cluster_update:u-999') + + def 
testWaitForOperationClusterUpdateFailedRaises(self): + """WaitForOperation raises when cluster update ends in Failed.""" + cluster = self._make_cluster() + self.MockIssueCommand({'describe-update': [('Failed\n', '', 0)]}) + with self.assertRaises(Exception): + cluster.WaitForOperation('cluster_update:u-fail') + + def testWaitForOperationUnknownHandleRaises(self): + """WaitForOperation raises ValueError for unknown handle prefix.""" + cluster = self._make_cluster() + with self.assertRaises(ValueError): + cluster.WaitForOperation('unknown_handle:xyz') + + # ---- ResolveNodePoolVersions ---------------------------------------------- + + def testResolveNodePoolVersionsNMinus1Math(self): + """ResolveNodePoolVersions returns (N-1, N) from cluster_version.""" + cluster = self._make_cluster() + cluster.cluster_version = '1.34' + initial, target = cluster.ResolveNodePoolVersions() + self.assertEqual('1.33', initial) + self.assertEqual('1.34', target) + + def testResolveNodePoolVersionsStripsMinorPatch(self): + """ResolveNodePoolVersions strips patch from version strings.""" + cluster = self._make_cluster() + cluster.cluster_version = '1.33.7' + initial, target = cluster.ResolveNodePoolVersions() + self.assertEqual('1.32', initial) + self.assertEqual('1.33', target) + + # ---- _DiscoverSubnets ----------------------------------------------------- + + def testDiscoverSubnets(self): + """_DiscoverSubnets returns subnet IDs from describe-cluster.""" + cluster = self._make_cluster() + describe_out = json.dumps({ + 'cluster': { + 'resourcesVpcConfig': { + 'subnetIds': ['subnet-aaa', 'subnet-bbb'] + } + } + }) + self.MockIssueCommand({'describe-cluster': [(describe_out, '', 0)]}) + subnets = cluster._DiscoverSubnets() + self.assertEqual(['subnet-aaa', 'subnet-bbb'], subnets) + + def testDiscoverSubnetsCached(self): + """_DiscoverSubnets uses cached result on second call.""" + cluster = self._make_cluster() + cluster._cached_subnets = ['subnet-cached'] + # No IssueCommand calls 
expected because cache is used + with mock.patch.object(vm_util, 'IssueCommand') as mock_issue: + result = cluster._DiscoverSubnets() + mock_issue.assert_not_called() + self.assertEqual(['subnet-cached'], result) + + # ---- _DiscoverSubnetsPerAZ ------------------------------------------------ + + def testDiscoverSubnetsPerAZBuildsAzMap(self): + """_DiscoverSubnetsPerAZ builds a {AZ: subnet_id} map from EC2.""" + cluster = self._make_cluster() + cluster._cached_subnets = ['subnet-a1', 'subnet-b2'] + subnets_out = json.dumps([ + {'SubnetId': 'subnet-a1', 'AZ': 'us-west-1a'}, + {'SubnetId': 'subnet-b2', 'AZ': 'us-west-1b'}, + ]) + self.MockIssueCommand({'describe-subnets': [(subnets_out, '', 0)]}) + az_map = cluster._DiscoverSubnetsPerAZ() + self.assertEqual({'us-west-1a': 'subnet-a1', 'us-west-1b': 'subnet-b2'}, + az_map) + + # ---- _DiscoverNodeRoleArn ------------------------------------------------- + + def testDiscoverNodeRoleArn(self): + """_DiscoverNodeRoleArn returns role ARN from the first nodegroup.""" + cluster = self._make_cluster() + list_out = json.dumps({'nodegroups': ['ng1']}) + describe_out = json.dumps({ + 'nodegroup': {'nodeRole': 'arn:aws:iam::1234:role/MyRole'} + }) + self.MockIssueCommand({ + 'list-nodegroups': [(list_out, '', 0)], + 'describe-nodegroup': [(describe_out, '', 0)], + }) + arn = cluster._DiscoverNodeRoleArn() + self.assertEqual('arn:aws:iam::1234:role/MyRole', arn) + + def testDiscoverNodeRoleArnRaisesWhenNoNodegroup(self): + """_DiscoverNodeRoleArn raises CreationError when no nodegroups found.""" + cluster = self._make_cluster() + list_out = json.dumps({'nodegroups': []}) + self.MockIssueCommand({'list-nodegroups': [(list_out, '', 0)]}) + with self.assertRaises(errors.Resource.CreationError): + cluster._DiscoverNodeRoleArn() + + # ---- _ResolveReleaseVersion ----------------------------------------------- + + def testResolveReleaseVersion(self): + """_ResolveReleaseVersion returns the SSM parameter value.""" + cluster = 
self._make_cluster() + self.MockIssueCommand({ + 'get-parameter': [('1.33.10-20260101\n', '', 0)] + }) + version = cluster._ResolveReleaseVersion('1.33') + self.assertEqual('1.33.10-20260101', version) + + def testResolveReleaseVersionCached(self): + """_ResolveReleaseVersion uses cache for repeated calls.""" + cluster = self._make_cluster() + self.MockIssueCommand({ + 'get-parameter': [('1.34.2-20260101\n', '', 0)] + }) + v1 = cluster._ResolveReleaseVersion('1.34') + v2 = cluster._ResolveReleaseVersion('1.34') + self.assertEqual(v1, v2) + + def testResolveReleaseVersionRaisesOnFailure(self): + """_ResolveReleaseVersion raises CreationError when SSM lookup fails.""" + cluster = self._make_cluster() + self.MockIssueCommand({'get-parameter': [('', 'not found', 1)]}) + with self.assertRaises(errors.Resource.CreationError): + cluster._ResolveReleaseVersion('1.99') + + if __name__ == '__main__': unittest.main() diff --git a/tests/providers/azure/azure_kubernetes_service_test.py b/tests/providers/azure/azure_kubernetes_service_test.py index 7ca09fb29c..3f6334e998 100644 --- a/tests/providers/azure/azure_kubernetes_service_test.py +++ b/tests/providers/azure/azure_kubernetes_service_test.py @@ -1,3 +1,6 @@ +"""Tests for the Azure Kubernetes Service provider.""" +# pylint: disable=invalid-name,protected-access + import unittest from unittest import mock from absl.testing import flagsaver @@ -7,12 +10,14 @@ from perfkitbenchmarker.providers.azure import azure_kubernetes_service from perfkitbenchmarker.providers.azure import azure_network from perfkitbenchmarker.providers.azure import util -from tests import pkb_common_test_case +from tests import pkb_common_test_case # pylint: disable=no-name-in-module class AzureKubernetesServiceTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the AksCluster provider.""" def setUp(self): + """Sets up mocks and creates a default AksCluster for each test.""" super().setUp() self.enter_context( mock.patch.object( @@ -70,6 +75,7 @@ 
def initAksCluster(self, spec_dict): self.aks.resource_group.args = [] def testCreate(self): + """AksCluster.Create() issues the expected az aks create command.""" mock_cmd = self.MockIssueCommand( { 'az aks create': [('', '', 0)], @@ -119,6 +125,7 @@ def testCreate(self): ) def testCreateError(self): + """AksCluster.Create() raises CreationError when az aks create fails.""" self.MockIssueCommand( { 'az aks create': [('out', 'Error could not create', 1)], @@ -141,6 +148,7 @@ def testCreateError(self): self.aks.Create() def testCreateNodepool(self): + """Additional nodepools appear in az aks nodepool add commands.""" mock_cmd = self.MockIssueCommand( { 'az aks create': [('', '', 0)], @@ -207,6 +215,7 @@ def testCreateAutoscaler(self): ) def testCreateAutoscaler_NodepoolAndClamps(self): + """Autoscaler min/max/desired values propagate to nodepool add commands.""" mock_cmd = self.MockIssueCommand( { 'az aks create': [('', '', 0)], @@ -229,12 +238,13 @@ def testCreateAutoscaler_NodepoolAndClamps(self): self.aks._Create() self.assertIn( '--enable-cluster-autoscaler --min-count=4 --max-count=6' - ' --node-count=4', + + ' --node-count=4', mock_cmd.all_commands, ) @flagsaver.flagsaver(kubectl='kubectl', kubeconfig='dummy') def testFullCreateAksAutomatic(self): + """AksAutomaticCluster.Create() issues RBAC and policy assignment cmds.""" aks_auto = azure_kubernetes_service.AksAutomaticCluster(self.spec) aks_auto.resource_group.name = 'resource-group' mock_cmd = self.MockIssueCommand( @@ -258,7 +268,8 @@ def testFullCreateAksAutomatic(self): ('servicePrincipal', '', 0), ('user-name', '', 0), ( - 'test-user@example.com\n12345678-1234-1234-1234-123456789abc', + 'test-user@example.com\n' + + '12345678-1234-1234-1234-123456789abc', '', 0, ), @@ -272,7 +283,7 @@ def testFullCreateAksAutomatic(self): aks_auto.Create() self.assertIn( 'az role assignment create --assignee user-name --role Azure Kubernetes' - ' Service RBAC Admin', + + ' Service RBAC Admin', mock_cmd.all_commands, ) 
self.assertIn( @@ -281,11 +292,12 @@ def testFullCreateAksAutomatic(self): ) self.assertIn( 'az policy assignment update --name' - ' aks-deployment-safeguards-policy-assignment', + + ' aks-deployment-safeguards-policy-assignment', mock_cmd.all_commands, ) def testGetNodePoolNames(self): + """GetNodePoolNames returns pool names from az aks nodepool list output.""" self.MockIssueCommand( { 'az aks nodepool list': [( @@ -308,5 +320,238 @@ def testGetNodePoolNames(self): self.assertEqual(self.aks.GetNodePoolNames(), ['default', 'nodepool1']) +class AksManagementPlaneTest(AzureKubernetesServiceTest): + """Tests for AKS management-plane methods (k8s_management_benchmark).""" + + # These tests are inherited from AzureKubernetesServiceTest but are not + # relevant to the management-plane test suite. Override to skip them so + # they don't pollute the AksManagementPlaneTest results. + def testCreate(self): + pass + + def testCreateError(self): + pass + + def _make_nodepool_config(self, name='pkbpool0', + machine_type='Standard_D2s_v5', + num_nodes=2): + cfg = mock.MagicMock() + cfg.name = name + cfg.num_nodes = num_nodes + cfg.machine_type = machine_type + cfg.min_nodes = num_nodes + cfg.max_nodes = num_nodes + cfg.disk_size = 100 + return cfg + + # ---- CreateNodePool ------------------------------------------------------- + + def testCreateNodePool(self): + """CreateNodePool issues 'az aks nodepool add' with cluster-name.""" + mock_cmd = self.MockIssueCommand({'az aks nodepool add': [('', '', 0)]}) + self.aks.CreateNodePool(self._make_nodepool_config('testpool')) + + self.assertIn('az aks nodepool add', mock_cmd.all_commands) + self.assertIn('--cluster-name', mock_cmd.all_commands) + self.assertIn('--labels', mock_cmd.all_commands) + + def testCreateNodePoolWithVersion(self): + """CreateNodePool passes --kubernetes-version when node_version is set.""" + self.aks.cluster_version = '1.33' + mock_cmd = self.MockIssueCommand({'az aks nodepool add': [('', '', 0)]}) + 
self.aks.CreateNodePool( + self._make_nodepool_config('verpool'), node_version='1.32' + ) + self.assertIn('--kubernetes-version 1.32', mock_cmd.all_commands) + + def testCreateNodePoolRaisesOnFailure(self): + """CreateNodePool raises CreationError when CLI fails.""" + self.MockIssueCommand({'az aks nodepool add': [('', 'error', 1)]}) + with self.assertRaises(errors.Resource.CreationError): + self.aks.CreateNodePool(self._make_nodepool_config('failpool')) + + # ---- DeleteNodePool ------------------------------------------------------- + + def testDeleteNodePool(self): + """DeleteNodePool issues 'az aks nodepool delete' with cluster-name.""" + mock_cmd = self.MockIssueCommand( + {'az aks nodepool delete': [('', '', 0)]} + ) + self.aks.DeleteNodePool('old-pool') + + self.assertIn('az aks nodepool delete', mock_cmd.all_commands) + self.assertIn('--cluster-name', mock_cmd.all_commands) + + # ---- UpgradeNodePool ------------------------------------------------------ + + def testUpgradeNodePool(self): + """UpgradeNodePool issues 'az aks nodepool upgrade' with version.""" + mock_cmd = self.MockIssueCommand( + {'az aks nodepool upgrade': [('', '', 0)]} + ) + self.aks.UpgradeNodePool('my-pool', '1.34') + + self.assertIn('az aks nodepool upgrade', mock_cmd.all_commands) + self.assertIn('--kubernetes-version 1.34', mock_cmd.all_commands) + + # ---- UpdateCluster -------------------------------------------------------- + + def testUpdateCluster(self): + """UpdateCluster issues 'az aks update' with a timestamp tag.""" + mock_cmd = self.MockIssueCommand({'az aks update': [('', '', 0)]}) + self.aks.UpdateCluster() + + self.assertIn('az aks update', mock_cmd.all_commands) + self.assertIn('--tags', mock_cmd.all_commands) + self.assertIn('k8s-mgmt-ts=', mock_cmd.all_commands) + + # ---- CreateNodePoolAsync -------------------------------------------------- + + def testCreateNodePoolAsyncReturnsNpSucceededHandle(self): + """CreateNodePoolAsync issues nodepool add with --no-wait.""" 
+ mock_cmd = self.MockIssueCommand( + {'az aks nodepool add': [('', '', 0)]} + ) + handle = self.aks.CreateNodePoolAsync(self._make_nodepool_config('apool')) + + self.assertIn('--no-wait', mock_cmd.all_commands) + self.assertTrue(handle.startswith('np_succeeded:')) + + def testCreateNodePoolAsyncRaisesOnFailure(self): + """CreateNodePoolAsync raises CreationError on CLI failure.""" + self.MockIssueCommand({'az aks nodepool add': [('', 'err', 1)]}) + with self.assertRaises(errors.Resource.CreationError): + self.aks.CreateNodePoolAsync(self._make_nodepool_config('failpool')) + + # ---- UpgradeNodePoolAsync ------------------------------------------------- + + def testUpgradeNodePoolAsyncReturnsNpSucceededHandle(self): + """UpgradeNodePoolAsync issues upgrade with --no-wait.""" + mock_cmd = self.MockIssueCommand( + {'az aks nodepool upgrade': [('', '', 0)]} + ) + handle = self.aks.UpgradeNodePoolAsync('my-pool', '1.34') + + self.assertIn('--no-wait', mock_cmd.all_commands) + self.assertTrue(handle.startswith('np_succeeded:')) + self.assertIn('--kubernetes-version 1.34', mock_cmd.all_commands) + + # ---- DeleteNodePoolAsync -------------------------------------------------- + + def testDeleteNodePoolAsyncReturnsNpGoneHandle(self): + """DeleteNodePoolAsync issues delete with --no-wait.""" + mock_cmd = self.MockIssueCommand( + {'az aks nodepool delete': [('', '', 0)]} + ) + handle = self.aks.DeleteNodePoolAsync('rm-pool') + + self.assertIn('--no-wait', mock_cmd.all_commands) + self.assertTrue(handle.startswith('np_gone:')) + + # ---- UpdateClusterAsync --------------------------------------------------- + + def testUpdateClusterAsyncScalesSystemPool(self): + """UpdateClusterAsync scales the system pool; returns cluster_succeeded.""" + pools_json = '[{"name": "nodepool1", "count": 2}]' + self.MockIssueCommand({ + 'az aks nodepool list': [(pools_json, '', 0)], + 'az aks nodepool scale': [('', '', 0)], + }) + handle = self.aks.UpdateClusterAsync() + 
self.assertEqual('cluster_succeeded', handle) + + def testUpdateClusterAsyncFallbackTagUpdate(self): + """UpdateClusterAsync falls back to tag update when nodepool list fails.""" + self.MockIssueCommand({ + 'az aks nodepool list': [('', 'err', 1)], + 'az aks update': [('', '', 0)], + }) + handle = self.aks.UpdateClusterAsync() + self.assertEqual('cluster_succeeded', handle) + + # ---- WaitForOperation ----------------------------------------------------- + + def testWaitForOperationNpSucceeded(self): + """WaitForOperation(np_succeeded:name) returns on Succeeded state.""" + self.MockIssueCommand( + {'az aks nodepool show': [('Succeeded\n', '', 0)]} + ) + # Should not raise + self.aks.WaitForOperation('np_succeeded:mypool') + + def testWaitForOperationNpSucceededFailedRaises(self): + """WaitForOperation raises CreationError on Failed provisioningState.""" + self.MockIssueCommand( + {'az aks nodepool show': [('Failed\n', '', 0)]} + ) + with self.assertRaises(errors.Resource.CreationError): + self.aks.WaitForOperation('np_succeeded:failpool') + + def testWaitForOperationNpGone(self): + """WaitForOperation(np_gone:name) returns when nodepool is not found.""" + self.MockIssueCommand({ + 'az aks nodepool show': [('', 'NotFound', 1)] + }) + # Should not raise + self.aks.WaitForOperation('np_gone:deleted-pool') + + def testWaitForOperationClusterSucceeded(self): + """WaitForOperation(cluster_succeeded) returns on Succeeded state.""" + self.MockIssueCommand({ + 'az aks show': [('Succeeded\n', '', 0)] + }) + # Should not raise + self.aks.WaitForOperation('cluster_succeeded') + + def testWaitForOperationClusterSucceededFailedRaises(self): + """WaitForOperation raises CreationError when cluster update is Failed.""" + self.MockIssueCommand({ + 'az aks show': [('Failed\n', '', 0)] + }) + with self.assertRaises(errors.Resource.CreationError): + self.aks.WaitForOperation('cluster_succeeded') + + def testWaitForOperationUnknownHandleRaises(self): + """WaitForOperation raises 
ValueError for an unknown handle prefix.""" + with self.assertRaises(ValueError): + self.aks.WaitForOperation('bad_handle:something') + + # ---- ResolveNodePoolVersions ---------------------------------------------- + + def testResolveNodePoolVersionsNMinus1Math(self): + """ResolveNodePoolVersions returns (N-1, N) from cluster_version.""" + self.aks.cluster_version = '1.34' + initial, target = self.aks.ResolveNodePoolVersions() + self.assertEqual('1.33', initial) + self.assertEqual('1.34', target) + + def testResolveNodePoolVersionsStripsMinorPatch(self): + """ResolveNodePoolVersions strips patch from full version string.""" + self.aks.cluster_version = '1.33.5' + initial, target = self.aks.ResolveNodePoolVersions() + self.assertEqual('1.32', initial) + self.assertEqual('1.33', target) + + # ---- _GetNodeFlags with version_override ---------------------------------- + + def testGetNodeFlagsVersionOverride(self): + """_GetNodeFlags uses version_override instead of cluster_version.""" + self.aks.cluster_version = '1.34' + cfg = self._make_nodepool_config() + flags = self.aks._GetNodeFlags(cfg, version_override='1.33') + self.assertIn('--kubernetes-version', flags) + idx = flags.index('--kubernetes-version') + self.assertEqual('1.33', flags[idx + 1]) + + def testGetNodeFlagsUsesClusterVersionWhenNoOverride(self): + """_GetNodeFlags uses cluster_version when version_override is None.""" + self.aks.cluster_version = '1.34' + cfg = self._make_nodepool_config() + flags = self.aks._GetNodeFlags(cfg, version_override=None) + self.assertIn('--kubernetes-version', flags) + idx = flags.index('--kubernetes-version') + self.assertEqual('1.34', flags[idx + 1]) + + if __name__ == '__main__': unittest.main() diff --git a/tests/providers/gcp/google_kubernetes_engine_test.py b/tests/providers/gcp/google_kubernetes_engine_test.py index dbf8232f5e..d49ac77d2a 100644 --- a/tests/providers/gcp/google_kubernetes_engine_test.py +++ b/tests/providers/gcp/google_kubernetes_engine_test.py @@ 
-13,10 +13,11 @@ # limitations under the License. """Tests for perfkitbenchmarker.providers.gcp.google_kubernetes_engine.""" -# pylint: disable=not-context-manager +# pylint: disable=not-context-manager,invalid-name,protected-access import builtins import contextlib +import json import os import tempfile import unittest @@ -61,6 +62,7 @@ class PatchedObjectsTestCase(pkb_common_test_case.PkbCommonTestCase): def patch_critical_objects( self, stdout='', stderr='', return_code=0, flags=FLAGS ): + """Patches common objects and yields a mock IssueCommand.""" with contextlib.ExitStack() as stack: flags.gcloud_path = 'gcloud' flags.run_uri = _RUN_URI @@ -99,10 +101,12 @@ def patch_critical_objects( class GoogleContainerRegistryTestCase(PatchedObjectsTestCase): + """Tests for the GoogleArtifactRegistry container registry.""" class FakeContainerImage(container.ContainerImage): + """Minimal ContainerImage stub for registry tests.""" - def __init__(self, name, directory=None): + def __init__(self, name, directory=None): # pylint: disable=super-init-not-called self.name = name self.directory = directory or f'docker/{name}/Dockerfile' @@ -117,6 +121,7 @@ def setUp(self): ) def testFullRegistryTag(self): + """Tests that full registry tag is constructed correctly.""" spec = container_spec.ContainerRegistrySpec( 'NAME', **{ @@ -132,6 +137,7 @@ def testFullRegistryTag(self): ) def testRemoteBuildCreateSucceeds(self): + """Tests that _Build succeeds when gcloud Issue returns success.""" spec = container_spec.ContainerRegistrySpec( 'NAME', **{ @@ -147,9 +153,11 @@ def testRemoteBuildCreateSucceeds(self): class GoogleKubernetesEngineCustomMachineTypeTestCase(PatchedObjectsTestCase): + """Tests for GKE cluster creation with a custom machine type.""" @staticmethod def create_kubernetes_engine_spec(): + """Creates a GKE spec with a custom CPU/memory machine type.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -180,9 +188,11 @@ def testCreate(self): class 
GoogleKubernetesEngineTestCase(PatchedObjectsTestCase): + """Tests for standard GKE cluster create/delete/exists operations.""" @staticmethod def create_kubernetes_engine_spec(): + """Creates a standard GKE cluster spec with typical VM options.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -204,6 +214,7 @@ def create_kubernetes_engine_spec(): return kubernetes_engine_spec def testCreate(self): + """Tests that _Create issues the correct gcloud command with all flags.""" spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects() as issue_command: cluster = google_kubernetes_engine.GkeCluster(spec) @@ -242,6 +253,7 @@ def testCreateQuotaExceeded(self): cluster._Create() def testCreateResourcesExhausted(self): + """Tests _Create raises InsufficientCapacityCloudFailure on exhaustion.""" spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects( stderr=""" @@ -258,6 +270,7 @@ def testCreateResourcesExhausted(self): cluster._Create() def testGetCredentials(self): + """Tests that _PostCreate issues get-credentials with KUBECONFIG set.""" spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects() as issue_command, mock.patch.object( kubectl, 'RunKubectlCommand' @@ -266,7 +279,7 @@ def testGetCredentials(self): cluster._Create() cluster._PostCreate() self.assertIn( - 'gcloud container clusters get-credentials pkb-{}'.format(_RUN_URI), + f'gcloud container clusters get-credentials pkb-{_RUN_URI}', issue_command.all_commands, ) self.assertIn( @@ -282,7 +295,7 @@ def testDelete(self): cluster._Delete() self.assertEqual(issue_command.func_to_mock.call_count, 5) self.assertIn( - 'gcloud container clusters delete pkb-{}'.format(_RUN_URI), + f'gcloud container clusters delete pkb-{_RUN_URI}', issue_command.all_commands, ) self.assertIn('--zone us-central1-a', issue_command.all_commands) @@ -293,11 +306,12 @@ def testExists(self): cluster = google_kubernetes_engine.GkeCluster(spec) 
cluster._Exists() self.assertIn( - 'gcloud container clusters describe pkb-{}'.format(_RUN_URI), + f'gcloud container clusters describe pkb-{_RUN_URI}', issue_command.all_commands, ) def testGetResourceMetadata(self): + """Tests that GetResourceMetadata returns all expected fields.""" spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects(stdout=_KUBECTL_VERSION): cluster = google_kubernetes_engine.GkeCluster(spec) @@ -328,9 +342,11 @@ def testCidrCalculations(self): class GoogleKubernetesEngineAutoscalingTestCase(PatchedObjectsTestCase): + """Tests for GKE cluster creation with cluster-level autoscaling.""" @staticmethod def create_kubernetes_engine_spec(): + """Creates a GKE spec with cluster-level autoscaling enabled.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -350,6 +366,7 @@ def create_kubernetes_engine_spec(): return kubernetes_engine_spec def testCreate(self): + """Tests that _Create passes autoscaling flags to gcloud.""" spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects() as issue_command: cluster = google_kubernetes_engine.GkeCluster(spec) @@ -364,6 +381,7 @@ def testCreate(self): self.assertIn('--cluster-ipv4-cidr /18', issue_command.all_commands) def testGetResourceMetadata(self): + """Tests that metadata includes autoscaling size fields.""" spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects(stdout=_KUBECTL_VERSION): cluster = google_kubernetes_engine.GkeCluster(spec) @@ -393,9 +411,11 @@ def testLabelDisks(self): class GoogleKubernetesEngineVersionFlagTestCase(PatchedObjectsTestCase): + """Tests for GKE cluster creation with version and release-channel flags.""" @staticmethod def create_kubernetes_engine_spec(): + """Creates a GKE spec for testing version and release-channel flags.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -443,9 +463,11 @@ def testCreateRapidChannel(self): class 
GoogleKubernetesEngineGvnicFlagTestCase(PatchedObjectsTestCase): + """Tests for GKE cluster creation with gVNIC enable/disable flags.""" @staticmethod def create_kubernetes_engine_spec(): + """Creates a GKE spec for testing the gVNIC flag.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -478,9 +500,11 @@ def testCreateDisableGvnic(self): class GoogleKubernetesEngineWithGpusTestCase(PatchedObjectsTestCase): + """Tests for GKE cluster creation with GPU accelerator configuration.""" @staticmethod def create_kubernetes_engine_spec(gpu_type): + """Creates a GKE spec with the given GPU type and 2 GPUs.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -501,6 +525,7 @@ def create_kubernetes_engine_spec(gpu_type): @flagsaver.flagsaver(gke_gpu_driver_version='latest') def testCreate(self): + """Tests that _Create includes the correct --accelerator flag for K80.""" spec = self.create_kubernetes_engine_spec('k80') with self.patch_critical_objects() as issue_command: cluster = google_kubernetes_engine.GkeCluster(spec) @@ -525,16 +550,19 @@ def testCreateGpuH100(self): cluster._Create() self.assertIn( '--accelerator ' - 'type=nvidia-h100-80gb,count=2,gpu-driver-version=default', + + 'type=nvidia-h100-80gb,count=2,gpu-driver-version=default', issue_command.all_commands, ) class GoogleKubernetesEngineGetNodesTestCase(GoogleKubernetesEngineTestCase): + """Tests for GKE node/instance-group enumeration methods.""" def testGetInstanceGroups(self): + """Tests that _GetInstanceGroups parses node-pools list output.""" path = os.path.join(os.path.dirname(__file__), _NODE_POOLS_LIST_OUTPUT) - output = open(path).read() + with open(path) as f: + output = f.read() spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects(stdout=output) as issue_command: cluster = google_kubernetes_engine.GkeCluster(spec) @@ -552,9 +580,13 @@ def testGetInstanceGroups(self): self.assertEqual(expected, set(instance_groups)) # 
order doesn't matter def testGetNodePoolNames(self): - output = ['default-pool', 'nap-e2-standard-2-iu4vquho', 'test-pool'] + """Tests that GetNodePoolNames returns names from cluster describe.""" + pool_names = ['default-pool', 'nap-e2-standard-2-iu4vquho', 'test-pool'] + json_output = json.dumps( + {'nodePools': [{'name': n} for n in pool_names]} + ) spec = self.create_kubernetes_engine_spec() - with self.patch_critical_objects(stdout='\n'.join(output)) as issue_command: + with self.patch_critical_objects(stdout=json_output) as issue_command: cluster = google_kubernetes_engine.GkeCluster(spec) node_pools = cluster.GetNodePoolNames() @@ -562,8 +594,8 @@ def testGetNodePoolNames(self): 'gcloud container clusters describe ' + cluster.name, issue_command.all_commands, ) - self.assertIn('--flatten', issue_command.all_commands) - self.assertIn('--format', issue_command.all_commands) + self.assertIn('--format json', issue_command.all_commands) + self.assertNotIn('--flatten', issue_command.all_commands) expected = { 'default-pool', @@ -574,9 +606,11 @@ def testGetNodePoolNames(self): class GoogleKubernetesEngineRegionalTestCase(PatchedObjectsTestCase): + """Tests for GKE regional cluster creation with multiple nodepools.""" @staticmethod def create_kubernetes_engine_spec(use_zonal_nodepools=False): + """Creates a regional GKE spec with two nodepools.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -619,6 +653,7 @@ def create_kubernetes_engine_spec(use_zonal_nodepools=False): return kubernetes_engine_spec def testCreateRegionalCluster(self): + """Tests regional cluster creation with region-wide nodepools.""" spec = self.create_kubernetes_engine_spec(use_zonal_nodepools=False) with self.patch_critical_objects() as issue_command: cluster = google_kubernetes_engine.GkeCluster(spec) @@ -668,6 +703,7 @@ def testCreateRegionalCluster(self): self.assertNotIn('--node-locations', create_nodepool2) def testCreateRegionalClusterZonalNodepool(self): + 
"""Tests regional cluster creation with zone-pinned nodepools.""" spec = self.create_kubernetes_engine_spec(use_zonal_nodepools=True) with self.patch_critical_objects() as issue_command: cluster = google_kubernetes_engine.GkeCluster(spec) @@ -706,9 +742,11 @@ def testCreateRegionalClusterZonalNodepool(self): class GoogleKubernetesEngineMachineFamiliesTestCase(PatchedObjectsTestCase): + """Tests for GKE nodepool creation with machine-family constraints.""" @staticmethod def create_kubernetes_engine_spec(): + """Creates a GKE spec with a nodepool using machine families.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -735,6 +773,7 @@ def create_kubernetes_engine_spec(): return kubernetes_engine_spec def testCreateWithMachineFamilies(self): + """Tests that machine-family nodepool issues a node-pools update command.""" spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects() as issue_command, mock.patch.object( kubernetes_commands, 'ApplyYaml' @@ -752,9 +791,11 @@ def testCreateWithMachineFamilies(self): class GoogleKubernetesEngineAutopilotTestCase(PatchedObjectsTestCase): + """Tests for GKE Autopilot cluster creation and metadata.""" @staticmethod def create_kubernetes_engine_spec(): + """Creates a GKE Autopilot cluster spec.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -771,6 +812,7 @@ def create_kubernetes_engine_spec(): return kubernetes_engine_spec def testCreate(self): + """Tests Autopilot _Create uses create-auto without node flags.""" spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects() as issue_command: cluster = google_kubernetes_engine.GkeAutopilotCluster(spec) @@ -788,6 +830,7 @@ def testCreate(self): self.assertNotIn('--num-nodes', issue_command.all_commands) def testGetResourceMetadata(self): + """Tests that Autopilot metadata includes Auto values for size/type.""" spec = self.create_kubernetes_engine_spec() with 
self.patch_critical_objects(): cluster = google_kubernetes_engine.GkeAutopilotCluster(spec) @@ -818,8 +861,9 @@ def testGetResourceMetadataIncludesReleaseChannel(self): metadata, ) - @flagsaver.flagsaver(run_uri='123') + @flagsaver.flagsaver(gpu_type='h100', gpu_count=1, run_uri='123') def testApplyYamlGpusH100(self): + """Tests Autopilot YAML generation for H100 GPU node selectors.""" self.enter_context( mock.patch( gce_network.__name__ + '.GceFirewall.GetFirewall', @@ -858,8 +902,6 @@ def testApplyYamlGpusH100(self): ) ) spec = self.create_kubernetes_engine_spec() - spec.vm_spec.gpu_count = 1 - spec.vm_spec.gpu_type = 'h100' with self.assertLogs(level='INFO') as logs: cluster = google_kubernetes_engine.GkeAutopilotCluster(spec) yamls = kubernetes_commands.ConvertManifestToYamlDicts( @@ -884,6 +926,7 @@ def testApplyYamlGpusH100(self): self.assertIn('cloud.google.com/compute-class: Accelerator', full_logs) def testGetMachineTypeFromNodeName(self): + """Tests GetMachineTypeFromNodeName queries kubectl for node type.""" spec = self.create_kubernetes_engine_spec() with self.patch_critical_objects(): cluster = google_kubernetes_engine.GkeAutopilotCluster(spec) @@ -899,8 +942,10 @@ def testGetMachineTypeFromNodeName(self): class GoogleKubernetesEngineNodepoolAutoscalingTestCase(PatchedObjectsTestCase): + """Tests GKE per-nodepool autoscaling overrides cluster-level settings.""" def testCreateWithPerNodepoolAutoscaling(self): + """Tests per-nodepool autoscaling settings override cluster defaults.""" kubernetes_engine_spec = container_spec.ContainerClusterSpec( 'NAME', **{ @@ -949,5 +994,298 @@ def testCreateWithPerNodepoolAutoscaling(self): self.assertIn('--max-nodes 10', nodepool_cmd) +class GkeManagementPlaneTestCase(PatchedObjectsTestCase): + """Tests for GKE management-plane methods (k8s_management_benchmark).""" + + @staticmethod + def create_kubernetes_engine_spec(): + """Creates a GKE spec for management-plane method tests.""" + return 
container_spec.ContainerClusterSpec( + 'NAME', + **{ + 'cloud': 'GCP', + 'vm_spec': { + 'GCP': { + 'machine_type': 'fake-machine-type', + 'zone': 'us-central1-a', + }, + }, + 'vm_count': 2, + 'poll_for_events': False, + }, + ) + + def _make_nodepool_config(self, name='pkbpool0'): + """Returns a minimal BaseNodePoolConfig-like object.""" + cfg = mock.MagicMock() + cfg.name = name + cfg.num_nodes = 1 + cfg.machine_type = 'n1-standard-2' + cfg.disk_size = 100 + cfg.max_local_disks = 0 + cfg.zone = None + return cfg + + # ---- GetNodePoolNames (JSON format) --------------------------------------- + + def testGetNodePoolNamesJsonFormat(self): + """GetNodePoolNames parses JSON cluster describe output.""" + cluster_json = ( + '{"nodePools": [{"name": "default-pool"}, {"name": "extra-pool"}]}' + ) + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout=cluster_json) as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + names = cluster.GetNodePoolNames() + + self.assertIn( + 'gcloud container clusters describe ' + cluster.name, + issue_command.all_commands, + ) + self.assertIn('--format', issue_command.all_commands) + # Must NOT use --flatten (old format) + self.assertNotIn('--flatten', issue_command.all_commands) + self.assertEqual({'default-pool', 'extra-pool'}, set(names)) + + def testGetNodePoolNamesEmptyFallback(self): + """GetNodePoolNames falls back to split() on non-JSON output.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout='pool-a pool-b'): + cluster = google_kubernetes_engine.GkeCluster(spec) + names = cluster.GetNodePoolNames() + self.assertEqual({'pool-a', 'pool-b'}, set(names)) + + # ---- CreateNodePool ------------------------------------------------------- + + def testCreateNodePool(self): + """CreateNodePool issues gcloud node-pools create with cluster flag.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects() as issue_command: + 
cluster = google_kubernetes_engine.GkeCluster(spec) + cfg = self._make_nodepool_config('mypool') + cluster.CreateNodePool(cfg) + + cmd = issue_command.GetCommandWithSubstring('node-pools create mypool') + self.assertIn('--cluster', cmd) + self.assertNotIn('--node-version', cmd) + + def testCreateNodePoolWithVersion(self): + """CreateNodePool passes --node-version when provided.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cfg = self._make_nodepool_config('mypool') + cluster.CreateNodePool(cfg, node_version='1.34.1-gke.100') + + cmd = issue_command.GetCommandWithSubstring('node-pools create mypool') + self.assertIn('--node-version 1.34.1-gke.100', cmd) + + # ---- DeleteNodePool ------------------------------------------------------- + + def testDeleteNodePool(self): + """DeleteNodePool issues gcloud node-pools delete with --quiet.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster.DeleteNodePool('old-pool') + + cmd = issue_command.GetCommandWithSubstring('node-pools delete old-pool') + self.assertIn('--cluster', cmd) + self.assertIn('--quiet', cmd) + + # ---- UpgradeNodePool ------------------------------------------------------ + + def testUpgradeNodePool(self): + """UpgradeNodePool issues gcloud clusters upgrade with --node-pool.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster.UpgradeNodePool('my-pool', '1.34.1-gke.200') + + cmd = issue_command.GetCommandWithSubstring('clusters upgrade') + self.assertIn('--node-pool my-pool', cmd) + self.assertIn('--cluster-version 1.34.1-gke.200', cmd) + self.assertIn('--quiet', cmd) + + # ---- UpdateCluster -------------------------------------------------------- + + def 
testUpdateCluster(self): + """UpdateCluster issues gcloud clusters update with a timestamp label.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster.UpdateCluster() + + cmd = issue_command.GetCommandWithSubstring('clusters update') + self.assertIn('--update-labels', cmd) + self.assertIn('k8s-mgmt-ts=', cmd) + + # ---- Async variants ------------------------------------------------------- + + def testCreateNodePoolAsyncReturnsOpName(self): + """CreateNodePoolAsync returns the GKE operation name.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects( + stdout='extra line\noperation-1234\n' + ) as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cfg = self._make_nodepool_config('asyncpool') + handle = cluster.CreateNodePoolAsync(cfg) + + cmd = issue_command.GetCommandWithSubstring('node-pools create asyncpool') + self.assertIn('--async', cmd) + self.assertNotIn('--timeout', cmd) + self.assertEqual('operation-1234', handle) + + def testCreateNodePoolAsyncWithVersion(self): + """CreateNodePoolAsync passes --node-version when provided.""" + spec = self.create_kubernetes_engine_spec() + stdout = 'operation-5678\n' + with self.patch_critical_objects(stdout=stdout) as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cfg = self._make_nodepool_config('verpool') + cluster.CreateNodePoolAsync(cfg, node_version='1.33.5-gke.1') + + cmd = issue_command.GetCommandWithSubstring('node-pools create verpool') + self.assertIn('--node-version 1.33.5-gke.1', cmd) + + def testDeleteNodePoolAsyncReturnsOpName(self): + """DeleteNodePoolAsync issues delete with --async.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout='operation-del\n') as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + handle = cluster.DeleteNodePoolAsync('to-delete') + + 
cmd = issue_command.GetCommandWithSubstring('node-pools delete to-delete') + self.assertIn('--async', cmd) + self.assertIn('--quiet', cmd) + self.assertEqual('operation-del', handle) + + def testUpgradeNodePoolAsyncReturnsOpName(self): + """UpgradeNodePoolAsync issues upgrade with --async.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout='operation-upg\n') as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + handle = cluster.UpgradeNodePoolAsync('my-pool', '1.34.2-gke.100') + + cmd = issue_command.GetCommandWithSubstring('clusters upgrade') + self.assertIn('--async', cmd) + self.assertIn('--node-pool my-pool', cmd) + self.assertIn('--cluster-version 1.34.2-gke.100', cmd) + self.assertEqual('operation-upg', handle) + + def testUpdateClusterAsyncReturnsOpName(self): + """UpdateClusterAsync issues clusters update with --async.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout='operation-upd\n') as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + handle = cluster.UpdateClusterAsync() + + cmd = issue_command.GetCommandWithSubstring('clusters update') + self.assertIn('--async', cmd) + self.assertIn('k8s-mgmt-ts=', cmd) + self.assertEqual('operation-upd', handle) + + def testIssueAsyncRaisesOnNonZeroRetcode(self): + """_IssueAsync raises CreationError when the command fails.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stderr='boom', return_code=1): + cluster = google_kubernetes_engine.GkeCluster(spec) + cfg = self._make_nodepool_config('failpool') + with self.assertRaises(Exception): + cluster.CreateNodePoolAsync(cfg) + + def testIssueAsyncRaisesOnEmptyOpName(self): + """_IssueAsync raises CreationError when stdout produces no op name.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout=' \n '): + cluster = google_kubernetes_engine.GkeCluster(spec) + cfg = 
self._make_nodepool_config('emptypool') + with self.assertRaises(Exception): + cluster.CreateNodePoolAsync(cfg) + + # ---- WaitForOperation ----------------------------------------------------- + + def testWaitForOperationDone(self): + """WaitForOperation returns immediately when status is DONE.""" + spec = self.create_kubernetes_engine_spec() + done_json = '{"status": "DONE"}' + with self.patch_critical_objects(stdout=done_json): + cluster = google_kubernetes_engine.GkeCluster(spec) + # Should not raise + cluster.WaitForOperation('operation-xyz') + + def testWaitForOperationAbortingRaises(self): + """WaitForOperation raises CreationError when status is ABORTING.""" + spec = self.create_kubernetes_engine_spec() + aborted_json = '{"status": "ABORTING"}' + with self.patch_critical_objects(stdout=aborted_json): + cluster = google_kubernetes_engine.GkeCluster(spec) + with self.assertRaises(errors.Resource.CreationError): + cluster.WaitForOperation('operation-bad') + + # ---- ResolveNodePoolVersions ---------------------------------------------- + + def testResolveNodePoolVersions(self): + """ResolveNodePoolVersions returns (N-1 qualified, N qualified).""" + server_config = { + 'validNodeVersions': [ + '1.34.5-gke.100', + '1.34.3-gke.50', + '1.33.8-gke.200', + '1.33.5-gke.99', + '1.32.1-gke.10', + ] + } + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects( + stdout=json.dumps(server_config) + ) as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + initial, target = cluster.ResolveNodePoolVersions() + + cmd = issue_command.GetCommandWithSubstring('get-server-config') + self.assertIn('--format', cmd) + # target = newest overall = 1.34.5-gke.100 + self.assertEqual('1.34.5-gke.100', target) + # initial = best version for minor 33 = 1.33.8-gke.200 + self.assertEqual('1.33.8-gke.200', initial) + + def testResolveNodePoolVersionsNoVersionsRaises(self): + """ResolveNodePoolVersions raises GetError when versions list is empty.""" + 
spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout='{"validNodeVersions": []}'): + cluster = google_kubernetes_engine.GkeCluster(spec) + with self.assertRaises(errors.Resource.GetError): + cluster.ResolveNodePoolVersions() + + # ---- HasActiveUpgradeOperations ------------------------------------------- + + def testHasActiveUpgradeOperationsTrue(self): + """HasActiveUpgradeOperations returns True when an upgrade is running.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout='operation-upgrade-123\n'): + cluster = google_kubernetes_engine.GkeCluster(spec) + self.assertTrue(cluster.HasActiveUpgradeOperations()) + + def testHasActiveUpgradeOperationsFalse(self): + """HasActiveUpgradeOperations returns False when no upgrade is running.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout=''): + cluster = google_kubernetes_engine.GkeCluster(spec) + self.assertFalse(cluster.HasActiveUpgradeOperations()) + + def testHasActiveUpgradeOperationsUsesCorrectFilter(self): + """HasActiveUpgradeOperations queries for UPGRADE_NODES AND RUNNING.""" + spec = self.create_kubernetes_engine_spec() + with self.patch_critical_objects(stdout='') as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster.HasActiveUpgradeOperations() + + self.assertIn('operations list', issue_command.all_commands) + self.assertIn('UPGRADE_NODES', issue_command.all_commands) + self.assertIn('RUNNING', issue_command.all_commands) + + if __name__ == '__main__': unittest.main()