diff --git a/.pylintrc b/.pylintrc
deleted file mode 100644
index 5bfdc81cb3..0000000000
--- a/.pylintrc
+++ /dev/null
@@ -1,160 +0,0 @@
-# copybara:strip_begin(internal)
-# This is based on http://google3/devtools/gpylint/config/base/rc
-# copybara:strip_end
-
-
-# Default configuration for pylint, which should pass for all (incremental) changes.
-# See CONTRIBUTING.md for more.
-
-[MESSAGES CONTROL]
-# List of checkers and warnings to enable.
-enable=indexing-exception,old-raise-syntax
-
-disable=abstract-method,
-        attribute-defined-outside-init,
-        bad-option-value,
-        c-extension-no-member,
-        design,
-        file-ignored,
-        fixme,
-        global-statement,
-        invalid-metaclass,
-        locally-disabled,
-        locally-enabled,
-        misplaced-comparison-constant,
-        no-else-break,
-        no-else-continue,
-        no-else-raise,
-        no-else-return,
-        no-self-use,
-        pointless-except,
-        redundant-u-string-prefix,
-        similarities,
-        star-args,
-        suppressed-message,
-        trailing-newlines,
-        ungrouped-imports,
-        unnecessary-pass,
-        unspecified-encoding,
-        unsubscriptable-object,
-        useless-else-on-loop,
-        useless-object-inheritance,
-        useless-suppression,
-
-[BASIC]
-
-# Regular expression which should only match the name
-# of functions or classes which do not require a docstring.
-no-docstring-rgx=(__.*__|main)
-
-# Min length in lines of a function that requires a docstring.
-docstring-min-length=12
-
-# Regular expression which should only match correct module names. The
-# leading underscore is sanctioned for private modules by Google's style
-# guide.
-#
-# There are exceptions to the basic rule (_?[a-z][a-z0-9_]*) to cover
-# requirements of Python's module system and of the presubmit framework.
-module-rgx=^(_?[a-z][a-z0-9_]*)|__init__|PRESUBMIT|PRESUBMIT_unittest$
-
-# Regular expression which should only match correct module level names
-const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
-
-# Regular expression which should only match correct class attribute
-class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
-
-# Regular expression which should only match correct class names
-class-rgx=^_?[A-Z][a-zA-Z0-9]*$
-
-# Regular expression which should only match correct function names.
-# 'PascalCase' and 'snake_case' group names are used for consistency of naming
-# styles across functions and methods.
-function-rgx=^(?:(?P<PascalCase>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
-
-# Regular expression which should only match correct method names.
-# 'PascalCase' and 'snake_case' group names are used for consistency of naming
-# styles across functions and methods. 'exempt' indicates a name which is
-# consistent with all naming styles.
-method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|next)|(?P<PascalCase>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
-
-# Regular expression which should only match correct instance attribute names
-attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
-
-# Regular expression which should only match correct argument names
-argument-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression which should only match correct variable names
-variable-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression which should only match correct list comprehension /
-# generator expression variable names
-inlinevar-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression which should only match correct TypeVar names
-typevar-rgx=^_{0,2}(?:[^\W\da-z_]+|(?:[^\W\da-z_]+[^\WA-Z_]+)+T?)(?:_co(?:ntra)?)?$
-
-# Good variable names which should always be accepted, separated by a comma
-good-names=main,_
-
-# List of decorators that define properties, such as abc.abstractproperty.
-property-classes=abc.abstractproperty,functools.cached_property,google3.pyglib.function_utils.cached.property,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl,werkzeug.utils.cached_property
-
-[VARIABLES]
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# A regular expression matching names used for dummy variables (i.e. not used).
-dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid to define new builtins when possible.
-additional-builtins=
-
-# List of modules that are allowed to redefine builtins.
-redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
-
-[STRING]
-
-# This flag controls whether the implicit-str-concat should
-# generate a warning on implicit string concatenation in sequences defined over
-# several lines.
-check-str-concat-over-line-jumps=yes
-
-[CLASSES]
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,__new__,setUp
-
-# "class_" is also a valid for the first argument to a class method.
-valid-classmethod-first-arg=cls,class_
-
-
-[FORMAT]
-
-# Maximum number of characters on a single line.
-max-line-length=80
-
-# Regexp for a line that is allowed to be longer than the limit.
-# This "ignore" regex is today composed of:
-# (1) p4 expansion $Id$ lines
-# (2) Depot paths for go/ifthisthenthatlint directives.
-# (3) Long string constants not containing whitespaces. This is needed now we
-#     have switched Pyformat to use Pyink, and it would wrap strings constants
-#     with a narrow range of lengths (less than 80 - indentation) in parens.
-#     This causes GPylint to complain otherwise allowed per
-#     go/pystyle#line-length. See b/262137806 for more information.
-# Other lines might be allowed to be long by gpylint.pyformat_filter: see that
-# module for more information.
-ignore-long-lines=(?x)(\$Id:\s\/\/depot\/.+\#\d+\s\$|^\s*\#\ LINT\.ThenChange|^\s*\w+\ =\ (?P<quote>['"])\S+(?P=quote)$)
-
-# Maximum number of lines in a module
-max-module-lines=99999
-
-# String used as indentation unit. We differ from PEP8's normal 4 spaces.
-indent-string='  '
-
-# Do not warn about multiple statements on a single line for constructs like
-#   if test: stmt
-single-line-if-stmt=y
diff --git a/perfkitbenchmarker/linux_benchmarks/kubernetes_management_benchmark.py b/perfkitbenchmarker/linux_benchmarks/kubernetes_management_benchmark.py
new file mode 100644
index 0000000000..dbf07127ae
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/kubernetes_management_benchmark.py
@@ -0,0 +1,792 @@
+# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Benchmark for Kubernetes management plane operations.
+
+Measures GKE/EKS/AKS control-plane API responsiveness via three scenarios:
+  A. Concurrent node-pool create/upgrade/delete.
+  B. Node-pool create overlapping with a long-running cluster update.
+  C. Large-scale node-pool provisioning (single scale or sweep).
+
+Optimizations for minimum run time:
+  - Streaming concurrency in Scenario C (no batch barriers)
+  - Optional pipelined Scenario A (create->upgrade->delete per thread)
+  - Reduced poll_interval in provider WaitForOperation (5s vs 10s)
+  - Per-op threads capped at _MAX_CONCURRENT to avoid OS limits
+  - Accurate delete success rate via attempted_ops denominator
+"""
+
+import copy
+import dataclasses
+import statistics
+import threading
+import time
+from typing import Callable
+from unicodedata import name
+
+from absl import flags
+from absl import logging
+from perfkitbenchmarker import background_tasks
+from perfkitbenchmarker import benchmark_spec as bm_spec
+from perfkitbenchmarker import configs
+from perfkitbenchmarker import errors
+from perfkitbenchmarker import sample
+from perfkitbenchmarker.configs import benchmark_config_spec
+from perfkitbenchmarker.resources.container_service import (
+    container as container_lib)
+from perfkitbenchmarker.resources.container_service import kubectl
+from perfkitbenchmarker.resources.container_service import kubernetes_cluster
+
+_SLEEP_POD_NAME = 'pkb-mgmt-sleep'
+
+BENCHMARK_NAME = 'kubernetes_management'
+
+BENCHMARK_CONFIG = """
+kubernetes_management:
+  description: >
+    Benchmarks GKE/EKS/AKS management plane operations: concurrent node pool
+    create/upgrade/delete, overlapping cluster + node-pool ops, and large-scale
+    provisioning. Focused on control-plane API responsiveness.
+    Spec regions: GCP us-central1, AWS us-east-1 (closest), Azure eastus.
+    Equivalent machine types across clouds per Google benchmark spec.
+  container_cluster:
+    type: Kubernetes
+    vm_count: 1
+    vm_spec:
+      GCP:
+        # us-central1-a: spec primary region for GCP
+        # e2-standard-2: 2 vCPU 8GB — equivalent to t3.medium / D2s_v3
+        machine_type: e2-standard-2
+        zone: us-central1-a
+      AWS:
+        # us-east-1a: closest comparable region to GCP us-central1
+        # t3.medium: 2 vCPU 4GB — closest equivalent to e2-standard-2
+        machine_type: t3.medium
+        zone: us-east-1a
+      Azure:
+        # eastus: closest comparable region to GCP us-central1
+        # Standard_D2s_v3: 2 vCPU 8GB — equivalent to e2-standard-2
+        machine_type: Standard_D2s_v3
+        zone: eastus
+"""
+
+_VALID_SCENARIOS = frozenset({'A', 'B', 'C'})
+
+_CONCURRENT_NODEPOOLS = flags.DEFINE_integer(
+    'k8s_mgmt_concurrent_nodepools',
+    5,
+    'Number of node pools to create/upgrade/delete concurrently in Scenario A.',
+)
+_LARGE_SCALE_NODEPOOLS = flags.DEFINE_integer(
+    'k8s_mgmt_large_scale_nodepools',
+    1000,
+    'Number of node pools to provision in the large-scale Scenario C. '
+    + 'Spec target is 1000; ensure VPC/quota is available before running.',
+)
+_NODES_PER_NODEPOOL = flags.DEFINE_integer(
+    'k8s_mgmt_nodes_per_nodepool',
+    2,
+    'Number of nodes per node pool. Google spec: 2 nodes per pool.',
+)
+_INITIAL_VERSION = flags.DEFINE_string(
+    'k8s_mgmt_initial_version',
+    None,
+    'Kubernetes version for newly-created node pools (N-1). None = auto.',
+)
+_TARGET_VERSION = flags.DEFINE_string(
+    'k8s_mgmt_target_version',
+    None,
+    'Kubernetes version to upgrade node pools to (N). None = cluster version.',
+)
+_SCENARIOS = flags.DEFINE_list(
+    'k8s_mgmt_scenarios',
+    ['A', 'B', 'C'],
+    'Comma-separated subset of scenarios to run. Valid values: A, B, C.',
+)
+_SCALE_SWEEP = flags.DEFINE_list(
+    'k8s_mgmt_scale_sweep',
+    [],
+    'Comma-separated list of node-pool counts for Scenario C scale sweep. '
+    + 'Each scale runs as a separate sub-run with full create/delete cycle. '
+    + 'Example: --k8s_mgmt_scale_sweep=10,50,100,500,1000. '
+    + 'If empty, uses --k8s_mgmt_large_scale_nodepools.',
+)
+_MAX_CONCURRENT = flags.DEFINE_integer(
+    'k8s_mgmt_max_concurrent',
+    50,
+    'Cap on concurrent provider API calls within a batch. '
+    + 'Higher = faster but more aggressive on connection pools.',
+)
+_PIPELINE_SCENARIO_A = flags.DEFINE_boolean(
+    'k8s_mgmt_pipeline_scenario_a',
+    True,
+    'If True, run Scenario A as per-pool pipeline (create->upgrade->delete '
+    + 'back-to-back per thread). Minimizes wall time. '
+    + 'Set to False for spec-strict phase-by-phase execution.',
+)
+
+# AKS caps node-pool names at 12 chars — keep all names within that limit.
+_PREFIX = 'pkbm'
+
+
+def _ScenarioAName(i):
+  return f'{_PREFIX}a{i:03d}'
+
+
+_SCENARIO_B_NAME = f'{_PREFIX}b'
+
+
+def _ScenarioCName(i):
+  return f'{_PREFIX}c{i:04d}'
+
+@dataclasses.dataclass
+class _OpResult:
+  """Holds timing and outcome for a single async management-plane operation."""
+  name: str  # Operation or node-pool name this result belongs to.
+  init_dur: float  # Seconds until the kickoff call returned.
+  e2e_dur: float  # Seconds from kickoff start until the wait finished.
+  error: Exception | None = None  # None when the operation succeeded.
+
+
+def GetConfig(user_config):
+  return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+
+
+def CheckPrerequisites(
+    benchmark_config: benchmark_config_spec.BenchmarkConfigSpec,):
+  """Validates flag values and cluster type before any cloud calls."""
+  # Match case-insensitively, mirroring the normalization done in Run().
+  invalid = [s for s in _SCENARIOS.value
+             if s.strip().upper() not in _VALID_SCENARIOS]
+  if invalid:
+    raise errors.Config.InvalidValue(
+        f'Invalid value(s) for --k8s_mgmt_scenarios: {invalid}. '
+        + f'Valid options: {sorted(_VALID_SCENARIOS)}.')
+  for s in _SCALE_SWEEP.value:
+    try:
+      int(s.strip())
+    except ValueError as e:
+      raise errors.Config.InvalidValue(
+          f'Non-integer value in --k8s_mgmt_scale_sweep: {s!r}') from e
+  if benchmark_config.container_cluster.type != 'Kubernetes':
+    raise errors.Config.InvalidValue(
+        'kubernetes_management benchmark requires a Kubernetes'
+        + ' container cluster.')
+
+
+def Prepare(benchmark_spec: bm_spec.BenchmarkSpec) -> None:
+  """Starts the long-sleeping busybox pod used as the spec workload.
+
+  Also sets benchmark_spec.always_call_cleanup and logs the cluster name and
+  Kubernetes version.
+
+  Args:
+    benchmark_spec: spec whose container_cluster must be a KubernetesCluster.
+  """
+  cluster = benchmark_spec.container_cluster
+  assert isinstance(cluster, kubernetes_cluster.KubernetesCluster)
+  benchmark_spec.always_call_cleanup = True
+  logging.info(
+      'kubernetes_management Prepare: cluster=%s, version=%s',
+      cluster.name, cluster.k8s_version,
+  )
+  # Spec workload: "a simple container that sleeps for a given time".
+  # The pod only sleeps (86400 s), so it generates no data-plane load.
+  sleep_pod_args = [
+      'run', _SLEEP_POD_NAME, '--image=busybox', '--restart=Never',
+      '--', 'sleep', '86400',
+  ]
+  kubectl.RunKubectlCommand(sleep_pod_args)
+
+def _CleanStartSweep(cluster: kubernetes_cluster.KubernetesCluster) -> None:
+  """Deletes stale pkbm* node pools so each run starts clean (spec C.2)."""
+  try:
+    pool_names = cluster.GetNodePoolNames()
+    stale = [pool for pool in pool_names if pool.startswith(_PREFIX)]
+  except Exception:  # pylint: disable=broad-except
+    # Best-effort: a listing failure is logged, never propagated.
+    logging.exception('CleanStart: failed to list node pools')
+    return
+  if stale:
+    logging.warning('CleanStart: deleting %d stale pools: %s', len(stale),
+                    stale)
+    # Hand every stale pool name to RunThreaded for deletion.
+    background_tasks.RunThreaded(cluster.DeleteNodePool, stale)
+    return
+  logging.info('CleanStart: no stale pools found — clean start confirmed.')
+
+
+def Run(benchmark_spec: bm_spec.BenchmarkSpec) -> list[sample.Sample]:
+  """Runs the selected scenarios and returns a flat list of samples."""
+  cluster = benchmark_spec.container_cluster
+  assert isinstance(cluster, kubernetes_cluster.KubernetesCluster)
+
+  # Spec C.2: start clean by removing stale benchmark pools first.
+  _CleanStartSweep(cluster)
+
+  # Resolve versions once; flag values win, missing ones are auto-resolved.
+  # Google spec: initial=N-1, target=N (adjacent minor upgrade).
+  flag_initial = _INITIAL_VERSION.value
+  flag_target = _TARGET_VERSION.value
+  if flag_initial and flag_target:
+    initial, target = flag_initial, flag_target
+    source = 'flags'
+  else:
+    resolved_initial, resolved_target = cluster.ResolveNodePoolVersions()
+    initial = flag_initial or resolved_initial
+    target = flag_target or resolved_target
+    source = 'auto-resolved' if not (flag_initial or flag_target) else 'mixed'
+
+  logging.info(
+      'NodePool versions (%s): initial=%s -> target=%s '
+      + '(cluster k8s_version=%s) | nodes_per_pool=%d | machine_type=%s',
+      source,
+      initial,
+      target,
+      cluster.k8s_version,
+      _NODES_PER_NODEPOOL.value,
+      cluster.default_nodepool.machine_type
+      if hasattr(cluster, 'default_nodepool') else 'unknown',
+  )
+
+  scenarios = {s.strip().upper() for s in _SCENARIOS.value}
+  samples: list[sample.Sample] = []
+
+  if 'A' in scenarios:
+    samples += _RunScenarioA(cluster, initial, target)
+  if 'B' in scenarios:
+    samples += _RunScenarioB(cluster, initial)
+  if 'C' in scenarios:
+    # Scenario A/B pools may still be in Deleting state and count toward
+    # AKS's 100-pool cluster limit.  Sweep them out before Scenario C so we
+    # don't hit MaxAgentPoolCountReached mid-run.
+    _CleanStartSweep(cluster)
+    scales = ([int(x.strip()) for x in _SCALE_SWEEP.value]
+              if _SCALE_SWEEP.value else [_LARGE_SCALE_NODEPOOLS.value])
+    logging.info('Scenario C: scale sweep = %s', scales)
+    for scale in scales:
+      scenario_c_samples = _RunScenarioC(cluster, initial, scale)
+      for s in scenario_c_samples:
+        s.metadata['scenario_c_scale'] = str(scale)
+      samples += scenario_c_samples
+
+  # Tag all samples with version path and run config for published results.
+  run_meta = {
+      'initial_version': str(initial),
+      'target_version': str(target),
+      'cluster_k8s_version': str(cluster.k8s_version),
+      'nodes_per_nodepool': str(_NODES_PER_NODEPOOL.value),
+      'concurrent_nodepools': str(_CONCURRENT_NODEPOOLS.value),
+  }
+  for s in samples:
+    s.metadata.update(run_meta)
+
+  return samples
+
+
+def Cleanup(benchmark_spec: bm_spec.BenchmarkSpec) -> None:
+  """Removes the sleep pod and any node pools the benchmark left behind.
+
+  The pod delete is issued with raise_on_failure=False, and a node-pool
+  listing error is logged rather than propagated.
+  """
+  cluster = benchmark_spec.container_cluster
+  if cluster is None:
+    return
+  delete_pod_cmd = ['delete', 'pod', _SLEEP_POD_NAME, '--ignore-not-found']
+  kubectl.RunKubectlCommand(delete_pod_cmd, raise_on_failure=False)
+  try:
+    pool_names = cluster.GetNodePoolNames()
+    leftover = [pool for pool in pool_names if pool.startswith(_PREFIX)]
+  except Exception:  # pylint: disable=broad-except
+    logging.exception('Cleanup: failed to list node pools')
+    return
+  if leftover:
+    logging.info('Cleanup: deleting %d leftover node pools', len(leftover))
+    background_tasks.RunThreaded(cluster.DeleteNodePool, leftover)
+
+
+# ---------------------------------------------------------------------------
+# Scenario A
+# ---------------------------------------------------------------------------
+
+
+def _RunScenarioA(
+    cluster: kubernetes_cluster.KubernetesCluster,
+    initial: str,
+    target: str,
+) -> list[sample.Sample]:
+  """Runs Scenario A: concurrent node-pool create, upgrade and delete.
+
+  When --k8s_mgmt_pipeline_scenario_a is set the work is delegated to
+  _RunScenarioAPipelined. Otherwise three phases run back to back, each
+  phase firing its operations concurrently through _RunAsync:
+    1. create every pool at the initial version;
+    2. upgrade the pools whose create succeeded to the target version;
+    3. delete every pool still listed with the Scenario A name prefix.
+
+  Args:
+    cluster: cluster whose node pools are exercised.
+    initial: node version passed to CreateNodePoolAsync.
+    target: version passed to UpgradeNodePoolAsync.
+
+  Returns:
+    Samples produced by _OpSamples for the create, upgrade and delete phases.
+  """
+  n = _CONCURRENT_NODEPOOLS.value
+  if _PIPELINE_SCENARIO_A.value:
+    logging.info(
+        'Scenario A (pipelined): %d pools, initial=%s, target=%s',
+        n, initial, target)
+    return _RunScenarioAPipelined(cluster, n, initial, target)
+
+  logging.info(
+      'Scenario A (phase-by-phase): %d pools, initial=%s, target=%s',
+      n, initial, target)
+  pool_names = [_ScenarioAName(i) for i in range(n)]
+  configs_ = [_MakeNodePoolConfig(cluster, name) for name in pool_names]
+  samples: list[sample.Sample] = []
+
+  # ── Phase 1: concurrent creates ─────────────────────────────────────────
+  create_results = _RunAsync(
+      kickoff=lambda cfg: cluster.CreateNodePoolAsync(
+          cfg, node_version=initial),
+      wait_fn=cluster.WaitForOperation,
+      items=configs_,
+      get_name=lambda cfg: cfg.name,
+  )
+  samples += _OpSamples('ScenarioA_Create',
+                        create_results,
+                        attempted_ops=len(pool_names))
+
+  # ── Phase 2: concurrent upgrades (only successfully created pools) ───────
+  created = [r.name for r in create_results if r.error is None]
+  logging.info(
+      'Scenario A: %d/%d pools created — proceeding to upgrade',
+      len(created), n)
+  upgrade_results = _RunAsync(
+      kickoff=lambda name: cluster.UpgradeNodePoolAsync(name, target),
+      wait_fn=cluster.WaitForOperation,
+      items=created,
+      get_name=str,
+  )
+  samples += _OpSamples('ScenarioA_Upgrade',
+                        upgrade_results,
+                        attempted_ops=len(created))
+
+  # NOTE: no extra synchronization barrier separates the upgrade and delete
+  # phases. The delete phase starts as soon as _RunAsync returns for the
+  # upgrades; there is no settle delay and no provider-side polling for
+  # still-active upgrade operations.
+  # TODO: confirm that WaitForOperation alone is sufficient on every provider.
+
+  # ── Phase 3: concurrent deletes (live-list to catch EKS rollbacks) ──────
+  alive = [
+      p for p in cluster.GetNodePoolNames() if p.startswith(f'{_PREFIX}a')
+  ]
+  logging.info(
+      'Scenario A: %d live pools found for delete (originally %d)',
+      len(alive), n)
+  delete_results = _RunAsync(
+      kickoff=cluster.DeleteNodePoolAsync,
+      wait_fn=cluster.WaitForOperation,
+      items=alive,
+      get_name=str,
+  )
+  # attempted_ops=n: success rate reflects original request, not just live.
+  # EKS rolls back timed-out pools silently — without this shows 100%.
+  samples += _OpSamples('ScenarioA_Delete', delete_results, attempted_ops=n)
+  return samples
+
+
+def _RunScenarioAPipelined(
+    cluster: kubernetes_cluster.KubernetesCluster,
+    n: int,
+    initial: str,
+    target: str,
+) -> list[sample.Sample]:
+  """Per-pool pipeline: create->upgrade->delete back-to-back per thread.
+
+  Minimizes wall time: max_i(create_i + upgrade_i + delete_i) vs
+  max(creates)+max(upgrades)+max(deletes) in phase-by-phase mode.
+  Trade-off: ops run under mixed-type concurrent load.
+  """
+  pool_names = [_ScenarioAName(i) for i in range(n)]
+  creates = _Results()
+  upgrades = _Results()
+  deletes = _Results()
+
+  def DoPool(name: str):
+    cfg = _MakeNodePoolConfig(cluster, name)
+    init, e2e, err = _TimedAsync(
+        lambda: cluster.CreateNodePoolAsync(cfg, node_version=initial),
+        cluster.WaitForOperation,
+    )
+    creates.add(name, init, e2e, err)
+    # Nothing to upgrade or delete when the create failed.
+    if err is not None:
+      return
+    init, e2e, err = _TimedAsync(
+        lambda: cluster.UpgradeNodePoolAsync(name, target),
+        cluster.WaitForOperation,
+    )
+    upgrades.add(name, init, e2e, err)
+    # The delete is attempted even when the upgrade failed.
+    init, e2e, err = _TimedAsync(
+        lambda: cluster.DeleteNodePoolAsync(name), cluster.WaitForOperation)
+    deletes.add(name, init, e2e, err)
+
+  background_tasks.RunThreaded(
+      DoPool,
+      pool_names,
+      max_concurrent_threads=min(n, _MAX_CONCURRENT.value),
+  )
+  samples: list[sample.Sample] = []
+  samples += _OpSamples('ScenarioA_Create', creates.entries, attempted_ops=n)
+  # Upgrades are attempted only for pools whose create succeeded.
+  samples += _OpSamples('ScenarioA_Upgrade', upgrades.entries,
+                        attempted_ops=len(upgrades.entries))
+  samples += _OpSamples('ScenarioA_Delete', deletes.entries, attempted_ops=n)
+  return samples
+
+
+# ---------------------------------------------------------------------------
+# Scenario B
+# ---------------------------------------------------------------------------
+
+
+def _RunScenarioB(
+    cluster: kubernetes_cluster.KubernetesCluster,
+    initial: str,
+) -> list[sample.Sample]:
+  """Times a node-pool create that overlaps a cluster update (Scenario B).
+
+  The cluster update and the node-pool create start on separate threads and
+  are timed independently by _TimedAsync; one sample group is emitted per
+  operation. The test pool is deleted afterwards on a best-effort basis.
+  """
+  logging.info('Scenario B: overlapping cluster update + node-pool create')
+  pool_cfg = _MakeNodePoolConfig(cluster, _SCENARIO_B_NAME)
+  collected = _Results()
+
+  def DoClusterUpdate():
+    timing = _TimedAsync(cluster.UpdateClusterAsync, cluster.WaitForOperation)
+    collected.add('ScenarioB_ClusterUpdate', *timing)
+    logging.info('Scenario B ClusterUpdate: init=%.2fs e2e=%.2fs ok=%s',
+                 timing[0], timing[1], timing[2] is None)
+
+  def DoCreate():
+    def Kickoff():
+      return cluster.CreateNodePoolAsync(pool_cfg, node_version=initial)
+
+    timing = _TimedAsync(Kickoff, cluster.WaitForOperation)
+    collected.add('ScenarioB_NodePoolCreate', *timing)
+    logging.info('Scenario B NodePoolCreate: init=%.2fs e2e=%.2fs ok=%s',
+                 timing[0], timing[1], timing[2] is None)
+
+  workers = [DoClusterUpdate, DoCreate]
+  background_tasks.RunThreaded(lambda worker: worker(), workers)
+
+  samples: list[sample.Sample] = []
+  for entry in collected.entries:
+    samples.extend(_OpSamples(entry.name, [entry], attempted_ops=1))
+
+  # Best-effort removal of the test pool; a failure is only logged.
+  try:
+    cluster.DeleteNodePool(_SCENARIO_B_NAME)
+  except Exception:  # pylint: disable=broad-except
+    logging.exception('Scenario B: failed to delete test pool')
+  return samples
+
+
+# ---------------------------------------------------------------------------
+# Scenario C
+# ---------------------------------------------------------------------------
+
+
+def _RunScenarioC(
+    cluster: kubernetes_cluster.KubernetesCluster,
+    initial: str,
+    scale: int,
+) -> list[sample.Sample]:
+  """Large-scale node-pool provisioning at a given scale.
+
+  Sends all `scale` creates through _RunAsync, which caps concurrency at
+  _MAX_CONCURRENT. Deletes are issued only for pools still listed by the
+  cluster, while the delete SuccessRate keeps `scale` as its denominator, so
+  pools missing from the live list count as failures.
+  """
+  logging.info(
+      'Scenario C: scale=%d, max_concurrent=%d, initial_version=%s',
+      scale,
+      _MAX_CONCURRENT.value,
+      initial,
+  )
+  pool_names = [_ScenarioCName(i) for i in range(scale)]
+  configs_ = [_MakeNodePoolConfig(cluster, name) for name in pool_names]
+  samples: list[sample.Sample] = []
+
+  # ── Creates ──────────────────────────────────────────────────────────────
+  create_results = _RunAsync(
+      kickoff=lambda cfg: cluster.CreateNodePoolAsync(
+          cfg, node_version=initial),
+      wait_fn=cluster.WaitForOperation,
+      items=configs_,
+      get_name=lambda cfg: cfg.name,
+  )
+  created_ok = sum(1 for r in create_results if r.error is None)
+  logging.info('Scenario C scale=%d: %d/%d creates succeeded', scale,
+               created_ok, scale)
+  samples += _OpSamples('ScenarioC_Create',
+                        create_results,
+                        attempted_ops=scale)
+
+  # ── Deletes (live-list) ──────────────────────────────────────────────────
+  alive = [
+      p for p in cluster.GetNodePoolNames() if p.startswith(f'{_PREFIX}c')
+  ]
+  logging.info(
+      'Scenario C scale=%d: %d live pools for delete (originally %d;'
+      + ' %d rolled back by cloud)',
+      scale,
+      len(alive),
+      scale,
+      scale - len(alive),
+  )
+  if not alive:
+    logging.warning(
+        'Scenario C scale=%d: 0 live pools — all timed-out creates were'
+        + ' rolled back. Recording 0%% delete success rate.', scale)
+    samples += _OpSamples('ScenarioC_Delete', [], attempted_ops=scale)
+    return samples
+
+  delete_results = _RunAsync(
+      kickoff=cluster.DeleteNodePoolAsync,
+      wait_fn=cluster.WaitForOperation,
+      items=alive,
+      get_name=str,
+  )
+  # attempted_ops=scale: rate is measured against the original request count.
+  samples += _OpSamples('ScenarioC_Delete',
+                        delete_results,
+                        attempted_ops=scale)
+  return samples
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+class _Results:
+  """Lock-guarded accumulator of _OpResult entries shared across threads."""
+
+  def __init__(self):
+    self.entries: list[_OpResult] = []
+    self._lock = threading.Lock()
+
+  def add(self, name: str, init_dur: float, e2e_dur: float,
+          err: Exception | None) -> None:
+    """Appends one _OpResult built from the arguments, under the lock."""
+    with self._lock:
+      self.entries.append(_OpResult(name, init_dur, e2e_dur, err))
+
+
+def _TimedAsync(
+    kickoff: Callable[[], str],
+    wait_fn: Callable[[str], None],
+) -> tuple[float, float, Exception | None]:
+  """Times kickoff() followed by wait_fn(handle).
+
+  Returns (initiation seconds, end-to-end seconds, error or None). When
+  kickoff() raises, both durations are the time elapsed up to that failure.
+  """
+  started = time.monotonic()
+  failure = None
+  try:
+    handle = kickoff()
+  except Exception as exc:  # pylint: disable=broad-except
+    kickoff_elapsed = time.monotonic() - started
+    return kickoff_elapsed, kickoff_elapsed, exc
+  init_dur = time.monotonic() - started
+  try:
+    wait_fn(handle)
+  except Exception as exc:  # pylint: disable=broad-except
+    failure = exc
+  return init_dur, time.monotonic() - started, failure
+
+
+def _RunAsync(
+    kickoff: Callable,
+    wait_fn: Callable[[str], None],
+    items: list,
+    get_name: Callable[[object], str],
+) -> list[_OpResult]:
+  """Fires kickoff(item) concurrently for all items; returns timed results.
+
+  Work goes through background_tasks.RunThreaded with a concurrency cap of
+  --k8s_mgmt_max_concurrent. Results are appended as each op finishes.
+  """
+  if not items:
+    return []
+  results = _Results()
+  cap = min(len(items), _MAX_CONCURRENT.value)
+
+  def DoWrap(item):
+    init_dur, e2e_dur, err = _TimedAsync(lambda: kickoff(item), wait_fn)
+    name = get_name(item)
+    results.add(name, init_dur, e2e_dur, err)
+    logging.info('%s ok=%s initiation=%.2fs end_to_end=%.2fs', name,
+                 err is None, init_dur, e2e_dur)
+
+  background_tasks.RunThreaded(DoWrap, items, max_concurrent_threads=cap)
+  return results.entries
+
+
+def _MakeNodePoolConfig(
+    cluster: kubernetes_cluster.KubernetesCluster,
+    name: str,
+) -> container_lib.BaseNodePoolConfig:
+  """Returns a shallow copy of the default pool, renamed and resized."""
+  node_count = _NODES_PER_NODEPOOL.value
+  pool_cfg = copy.copy(cluster.default_nodepool)
+  pool_cfg.name = name
+  # Fixed-size pool: num_nodes, min_nodes and max_nodes share one value.
+  pool_cfg.num_nodes = pool_cfg.min_nodes = pool_cfg.max_nodes = node_count
+  return pool_cfg
+
+
+def _OpSamples(
+    metric_prefix: str,
+    results: list[_OpResult],
+    attempted_ops: int | None = None,
+) -> list[sample.Sample]:
+  """Per-op + aggregate samples for initiation and end-to-end latency.
+
+  Args:
+    metric_prefix: prefix for all metric names.
+    results:       _OpResult entries, one per operation that was executed.
+    attempted_ops: total ops originally requested. Used as the denominator
+                   for SuccessRate so EKS-rolled-back pools (which never
+                   appear in results) are counted as failures, not ignored.
+                   If None, len(results) is used (original behavior).
+  """
+  samples: list[sample.Sample] = []
+  init_latencies: list[float] = []
+  e2e_latencies: list[float] = []
+  success = 0
+
+  for r in results:
+    meta = {'operation_name': r.name, 'success': str(r.error is None)}
+    if r.error is not None:
+      meta['error'] = str(r.error)[:200]
+    else:
+      success += 1
+      init_latencies.append(r.init_dur)
+      e2e_latencies.append(r.e2e_dur)
+    samples.append(
+        sample.Sample(f'{metric_prefix}_InitiationLatency', r.init_dur,
+                      'seconds', dict(meta)))
+    samples.append(
+        sample.Sample(f'{metric_prefix}_EndToEndLatency', r.e2e_dur,
+                      'seconds', dict(meta)))
+
+  # ── Success rate ─────────────────────────────────────────────────────────
+  total = attempted_ops if attempted_ops is not None else len(results)
+  executed = len(results)
+  if total > 0:
+    samples.append(
+        sample.Sample(
+            f'{metric_prefix}_SuccessRate',
+            100.0 * success / total,
+            'percent',
+            {
+                'total_ops': str(total),
+                'executed_ops': str(executed),
+                'successful_ops': str(success),
+                'skipped_ops': str(total - executed),
+            },
+        ))
+
+  # ── Aggregate stats (successful ops only) ────────────────────────────────
+  for phase_label, latencies in (
+      ('InitiationLatency', init_latencies),
+      ('EndToEndLatency', e2e_latencies),
+  ):
+    if len(latencies) >= 2:
+      samples += _AggregateSamples(metric_prefix, phase_label, latencies)
+    if len(latencies) >= 4:
+      samples += _OutlierSamples(metric_prefix, phase_label, latencies)
+
+  return samples
+
+
+def _AggregateSamples(metric_prefix: str, phase_label: str,
+                      latencies: list[float]) -> list[sample.Sample]:
+  """Builds the summary-statistic samples for one latency series.
+
+  Args:
+    metric_prefix: leading part of every metric name.
+    phase_label: middle part of the metric name, e.g. 'EndToEndLatency'.
+    latencies: durations in seconds; the caller passes at least two values.
+
+  Returns:
+    Mean, StdDev (population), Min, Median, P90, P99 and Max samples, each
+    tagged with the series length as 'sample_count'.
+  """
+  # 99 inclusive-method cut points: index 89 is P90 and index 98 is P99.
+  percentiles = statistics.quantiles(latencies, n=100, method='inclusive')
+  summary = {
+      'Mean': statistics.mean(latencies),
+      'StdDev': statistics.pstdev(latencies),
+      'Min': min(latencies),
+      'Median': statistics.median(latencies),
+      'P90': percentiles[89],
+      'P99': percentiles[98],
+      'Max': max(latencies),
+  }
+  return [
+      sample.Sample(f'{metric_prefix}_{phase_label}_{label}', value, 'seconds',
+                    {'sample_count': str(len(latencies))})
+      for label, value in summary.items()
+  ]
+
+
+def _OutlierSamples(metric_prefix: str, phase_label: str,
+                    latencies: list[float]) -> list[sample.Sample]:
+  """Counts latencies lying outside the 1.5 * IQR fences.
+
+  Args:
+    metric_prefix: leading part of the metric name.
+    phase_label: middle part of the metric name.
+    latencies: durations in seconds.
+
+  Returns:
+    A one-element list holding the '<prefix>_<phase>_OutlierCount' sample;
+    its metadata records the quartiles, the IQR and both fences as strings.
+  """
+  # Inclusive-method quartile cut points are [Q1, Q2, Q3]; Q2 is not needed.
+  q1, _, q3 = statistics.quantiles(latencies, n=4, method='inclusive')
+  iqr = q3 - q1
+  lower_fence, upper_fence = q1 - 1.5 * iqr, q3 + 1.5 * iqr
+  outlier_count = sum(v < lower_fence or v > upper_fence for v in latencies)
+  meta = {
+      'q1': str(q1),
+      'q3': str(q3),
+      'iqr': str(iqr),
+      'upper_fence': str(upper_fence),
+      'lower_fence': str(lower_fence),
+      'sample_count': str(len(latencies)),
+  }
+  metric = f'{metric_prefix}_{phase_label}_OutlierCount'
+  return [sample.Sample(metric, outlier_count, 'count', meta)]
diff --git a/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py b/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
index 15b9adb5a8..c02b820ee7 100644
--- a/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
+++ b/perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
@@ -26,6 +26,7 @@
 import logging
 import math
 import re
+import threading
 from typing import Any
 from urllib import parse
 
@@ -45,6 +46,13 @@
 from perfkitbenchmarker.resources.container_service import kubernetes_cluster
 from perfkitbenchmarker.resources.container_service import kubernetes_commands
 
+# NOTE(review): this comment block originally announced a flag definition
+# for skipping EBS CSI driver setup, but no flags.DEFINE_* call follows it.
+# _Create() reads FLAGS.eks_skip_ebs_csi: when true, the OIDC association,
+# IAM role and EBS CSI addon install are skipped (saves ~3 minutes for
+# benchmarks such as kubernetes_management that use no persistent volumes).
+# Confirm the flag is registered elsewhere before flags are parsed;
+# otherwise reading it fails at runtime.
 FLAGS = flags.FLAGS
 # GPU types which practically require spot to get.
 _RARE_GPU_TYPES = [
@@ -54,7 +62,7 @@
 ]
 
 
-def RecursivelyUpdateDictionary(
+def _recursively_update_dictionary(
     original: dict[str, Any], updates: dict[str, Any]
 ) -> dict[str, Any]:
   """Updates a nested dictionary.
@@ -72,14 +80,14 @@ def RecursivelyUpdateDictionary(
   # Copied from https://stackoverflow.com/questions/3232943
   for k, v in updates.items():
     if isinstance(v, abc.Mapping):
-      original[k] = RecursivelyUpdateDictionary(original.get(k, {}), v)
+      original[k] = _recursively_update_dictionary(original.get(k, {}), v)
     else:
       original[k] = v
   return original
 
 
 class BaseEksCluster(kubernetes_cluster.KubernetesCluster):
-  """Shared base class for Elastic Kubernetes Service cluster auto mode & not."""
+  """Shared base class for EKS cluster (auto mode and standard)."""
 
   def __init__(self, spec):
     # EKS requires a region and optionally a list of one or zones.
@@ -107,6 +115,9 @@ def __init__(self, spec):
     self.account: str = util.GetAccount()
     self.node_to_nodepool: dict[str, container.BaseNodePoolConfig | None] = {}
     self.node_to_machine_type: dict[str, str | None] = {}
+    self._cached_subnets: list[str] | None = None
+    self._cached_subnets_per_az: dict[str, str] | None = None
+    self._cached_node_role_arn: str | None = None
 
   def _ChooseSecondZone(self):
     """Choose a second zone for the control plane if only one is specified."""
@@ -118,23 +129,30 @@ def _ChooseSecondZone(self):
           self.region + ('b' if self.zone.endswith('a') else 'a')
       )
 
-  def _CreateDependencies(self):
-    """Set up the ssh key."""
-    aws_virtual_machine.AwsKeyFileManager.ImportKeyfile(self.region)
 
-  def _DeleteDependencies(self):
-    """Delete the ssh key."""
-    aws_virtual_machine.AwsKeyFileManager.DeleteKeyfile(self.region)
+
+
 
   def _EksCtlCreate(self, create_json: dict[str, Any]):
     """Creates the EKS cluster."""
-    # If multiple zones are passed use them for the control plane.
-    # Otherwise EKS will auto-select control plane zones in the region.
-    if self.control_plane_zones:
-      create_json['availabilityZones'] = self.control_plane_zones
+    # The cluster-level availabilityZones list controls which AZs eksctl
+    # creates VPC subnets in. If it is too short, eksctl may only create
+    # subnets in 2 AZs even when 3 are requested, preventing round-robin
+    # nodegroup placement. Spreading nodegroups across AZs is critical to
+    # avoid per-AZ EC2 capacity limits.
+    # This method does not set availabilityZones; _Create populates it in
+    # create_json from the EC2 AZ query (bypassing PKB zone flag truncation).
+    # Log it here for visibility.
+    if 'availabilityZones' in create_json:
+      logging.info(
+          '[EKS] Creating cluster with AZs: %s — '
+          + 'eksctl will auto-assign CIDRs for all %d zones.',
+          create_json['availabilityZones'],
+          len(create_json['availabilityZones']),
+      )
     # Schema for the cluster create command is here:
     # https://schema.eksctl.io/
-    create_json = RecursivelyUpdateDictionary(
+    create_json = _recursively_update_dictionary(
         {
             'apiVersion': 'eksctl.io/v1alpha5',
             'kind': 'ClusterConfig',
@@ -185,6 +203,11 @@ def _RenderNodeGroupJson(
     if nodepool.min_nodes != nodepool.max_nodes:
       group_json['minSize'] = nodepool.min_nodes
       group_json['maxSize'] = nodepool.max_nodes
+    # Pin every nodegroup rendered here (the default one and any others built
+    # via eksctl) to control_plane_zones[0] so it stays in a single known AZ.
+    # CreateNodePoolAsync does not use this JSON; it picks AZs round-robin.
+    if self.control_plane_zones:
+      group_json['availabilityZones'] = [self.control_plane_zones[0]]
     return group_json
 
   def _WriteJsonToFile(self, json_dict: dict[str, Any]) -> str:
@@ -211,6 +234,34 @@ def _WriteJsonToFile(self, json_dict: dict[str, Any]) -> str:
 
   def _Delete(self):
     """Deletes the control plane and worker nodes."""
+    # Best-effort SSH key pair cleanup (formerly done in _DeleteDependencies).
+    try:
+      aws_virtual_machine.AwsKeyFileManager.DeleteKeyfile(self.region)
+    except Exception:  # pylint: disable=broad-except
+      pass
+    # Clean up dynamically created launch templates and capacity reservations
+    # Only runs if capacity reservations were actually created this run.
+    if getattr(FLAGS, 'eks_reserve_capacity_per_az', False):
+      for az in getattr(self, '_capacity_reservation_ids', {}).keys():
+        vm_util.IssueCommand(
+            util.AWS_PREFIX + [
+                'ec2', 'delete-launch-template',
+                '--launch-template-name', f'pkb-eks-lt-{az}',
+                '--region', self.region,
+            ],
+            raise_on_failure=False,
+        )
+        logging.info('[EKS] Deleted launch template pkb-eks-lt-%s', az)
+      for az, res_id in getattr(self, '_capacity_reservation_ids', {}).items():
+        vm_util.IssueCommand(
+            util.AWS_PREFIX + [
+                'ec2', 'cancel-capacity-reservation',
+                '--capacity-reservation-id', res_id,
+                '--region', self.region,
+            ],
+            raise_on_failure=False,
+        )
+        logging.info('[EKS] Cancelled capacity reservation %s in %s', res_id, az)
     super()._Delete()
     cmd = [
         FLAGS.eksctl,
@@ -392,60 +443,265 @@ def GetResourceMetadata(self):
 
   def _Create(self):
     """Creates the control plane and worker nodes."""
+    # Import SSH key pair to EC2 before cluster creation — eksctl requires it.
+    aws_virtual_machine.AwsKeyFileManager.ImportKeyfile(self.region)
     nodepool_jsons = [self._RenderNodeGroupJson(self.default_nodepool)]
     for _, node_group in self.nodepools.items():
       nodepool_jsons += [self._RenderNodeGroupJson(node_group)]
     create_json: dict[str, Any] = {
         'managedNodeGroups': nodepool_jsons,
-        'vpc': {
-            'nat': {'gateway': 'Disable'},
-        },
+        'vpc': {'nat': {'gateway': 'Disable'}},
     }
+    # Explicitly set cluster-level availabilityZones so eksctl creates VPC
+    # public+private subnets in up to 3 AZs of the region.
+    # IMPORTANT: PKB's deprecated --zones flag gets truncated by its own
+    # translation layer to 2 AZs even when 3 are specified. We bypass this
+    # by querying EC2 directly for the available AZs in the region and
+    # passing the first 3 (sorted by name) to eksctl. This gives the VPC
+    # subnets in each of those AZs for round-robin nodegroup placement.
+    try:
+      az_out, _, az_rc = vm_util.IssueCommand(
+          util.AWS_PREFIX + [
+              'ec2', 'describe-availability-zones',
+              '--region', self.region,
+              '--filters', 'Name=state,Values=available',
+              '--query', 'AvailabilityZones[*].ZoneName',
+              '--output', 'json',
+          ],
+          raise_on_failure=False,
+      )
+      if az_rc == 0 and az_out.strip():
+        all_azs = json.loads(az_out.strip())
+        # Limit to 3 AZs maximum to avoid excessive subnet creation
+        cluster_azs = sorted(all_azs)[:3]
+      else:
+        # Fallback: use control_plane_zones, else assume <region>a/b/c exist
+        cluster_azs = (
+            self.control_plane_zones
+            if self.control_plane_zones
+            else [f'{self.region}a', f'{self.region}b', f'{self.region}c']
+        )
+    except Exception:  # pylint: disable=broad-except
+      cluster_azs = (
+          self.control_plane_zones
+          if self.control_plane_zones
+          else [f'{self.region}a', f'{self.region}b', f'{self.region}c']
+      )
+
+    create_json['availabilityZones'] = cluster_azs
+    logging.info(
+        '[EKS] Cluster will have subnets in %d AZs: %s '
+        + '(queried from EC2, bypassing PKB zone flag truncation)',
+        len(cluster_azs), cluster_azs,
+    )
     self._EksCtlCreate(create_json)
 
+    # Dynamically create capacity reservations + launch templates AFTER cluster
+    # creation so cluster CA and endpoint are available for node bootstrap.
+    # Gate capacity reservations behind flag — disabled by default
+    # to avoid impacting other EKS benchmarks (kubernetes_nginx etc)
+    # that use different instance types and do not need reservations.
+    if not FLAGS.eks_reserve_capacity_per_az:
+      self._capacity_reservation_ids = {}
+      logging.info(
+          '[EKS] Skipping capacity reservations '
+          '(--eks_reserve_capacity_per_az=False)'
+      )
+    else:
+      self._capacity_reservation_ids = {}
+      # Instances reserved per AZ: max(80, 2 * concurrent pools + 20), i.e. 2
+      # nodes per pool plus headroom for the default nodegroup. That is 80 for
+      # up to 30 concurrent pools and 220 for 100 concurrent pools.
+      concurrent = getattr(FLAGS, 'k8s_mgmt_concurrent_nodepools', 10)
+      nodes_per_az = max(80, concurrent * 2 + 20)
+      # Fetch cluster CA and endpoint for bootstrap user data
+      import json as _json
+      cluster_out, _, cluster_rc = vm_util.IssueCommand(
+          util.AWS_PREFIX + [
+              'eks', 'describe-cluster',
+              '--name', self.name,
+              '--region', self.region,
+              '--query', 'cluster.{endpoint:endpoint,ca:certificateAuthority.data,cidr:kubernetesNetworkConfig.serviceIpv4Cidr}',
+              '--output', 'json',
+          ],
+          raise_on_failure=False,
+      )
+      cluster_ca = ''
+      cluster_endpoint = ''
+      cluster_service_cidr = '10.100.0.0/16'  # default fallback
+      if cluster_rc == 0 and cluster_out.strip():
+        cluster_info = _json.loads(cluster_out.strip())
+        cluster_ca = cluster_info.get('ca', '')
+        cluster_endpoint = cluster_info.get('endpoint', '')
+        cluster_service_cidr = cluster_info.get('cidr', '10.100.0.0/16')
+        logging.info('[EKS] Fetched cluster endpoint=%s cidr=%s for bootstrap',
+                     cluster_endpoint, cluster_service_cidr)
+
+      # Query EKS-optimized AMI once for all AZs
+      # cluster_version may be None if not explicitly set — fetch from cluster
+      if not self.cluster_version:
+        ver_out, _, ver_rc = vm_util.IssueCommand(
+            util.AWS_PREFIX + [
+                'eks', 'describe-cluster',
+                '--name', self.name,
+                '--region', self.region,
+                '--query', 'cluster.version',
+                '--output', 'text',
+            ],
+            raise_on_failure=False,
+        )
+        if ver_rc != 0 or not ver_out.strip():
+          raise errors.Resource.CreationError(
+              '[EKS] Failed to determine cluster version from describe-cluster. '
+              'Cannot proceed without a valid Kubernetes version. '
+              f'rc={ver_rc} out={ver_out.strip()!r}'
+          )
+        self.cluster_version = ver_out.strip()
+        logging.info('[EKS] Resolved cluster version: %s', self.cluster_version)
+      k8s_minor_str = '.'.join(self.cluster_version.split('.')[:2])
+      ami_out, _, ami_rc = vm_util.IssueCommand(
+          util.AWS_PREFIX + [
+              'ssm', 'get-parameter',
+              '--name', (
+                  f'/aws/service/eks/optimized-ami/{k8s_minor_str}/'
+                  'amazon-linux-2023/x86_64/standard/recommended/image_id'
+              ),
+              '--region', self.region,
+              '--query', 'Parameter.Value',
+              '--output', 'text',
+          ],
+          raise_on_failure=False,
+      )
+      ami_id = ami_out.strip() if ami_rc == 0 and ami_out.strip() else ''
+      logging.info('[EKS] EKS AMI for K8s %s: %s', k8s_minor_str, ami_id)
+
+      for az in cluster_azs:
+        logging.info('[EKS] Creating capacity reservation in %s (%d instances)...', az, nodes_per_az)
+        cap_out, _, cap_rc = vm_util.IssueCommand(
+            util.AWS_PREFIX + [
+                'ec2', 'create-capacity-reservation',
+                '--instance-type', 't3.medium',
+                '--instance-platform', 'Linux/UNIX',
+                '--availability-zone', az,
+                '--instance-count', str(nodes_per_az),
+                '--region', self.region,
+                '--query', 'CapacityReservation.CapacityReservationId',
+                '--output', 'text',
+            ],
+            raise_on_failure=False,
+        )
+        if cap_rc == 0 and cap_out.strip() and cap_out.strip() != 'None':
+          res_id = cap_out.strip()
+          self._capacity_reservation_ids[az] = res_id
+          logging.info('[EKS] Created capacity reservation %s in %s', res_id, az)
+          if ami_id and cluster_ca and cluster_endpoint:
+            import base64 as _b64
+            # AL2023 uses nodeadm YAML config — NOT the old bootstrap.sh
+            nodeadm_config = (
+                'apiVersion: node.eks.aws/v1alpha1' + chr(10) +
+                'kind: NodeConfig' + chr(10) +
+                'spec:' + chr(10) +
+                '  cluster:' + chr(10) +
+                f'    name: {self.name}' + chr(10) +
+                f'    apiServerEndpoint: {cluster_endpoint}' + chr(10) +
+                f'    certificateAuthority: {cluster_ca}' + chr(10) +
+                f'    cidr: {cluster_service_cidr}'
+            )
+            user_data = _b64.b64encode(('MIME-Version: 1.0' + chr(10) +
+                'Content-Type: multipart/mixed; boundary="==BOUNDARY=="' + chr(10) +
+                chr(10) +
+                '--==BOUNDARY==' + chr(10) +
+                'Content-Type: application/node.eks.aws' + chr(10) +
+                chr(10) +
+                nodeadm_config + chr(10) +
+                '--==BOUNDARY==--').encode()).decode()
+            logging.info('[EKS] Using AL2023 nodeadm bootstrap for %s', az)
+            lt_data = (
+                '{'
+                f'"ImageId":"{ami_id}",'
+                '"CapacityReservationSpecification":{'
+                '"CapacityReservationPreference":"capacity-reservations-only",'
+                f'"CapacityReservationTarget":{{"CapacityReservationId":"{res_id}"}}}},'
+                f'"UserData":"{user_data}"'
+                '}'
+            )
+            _, _, lt_rc = vm_util.IssueCommand(
+                util.AWS_PREFIX + [
+                    'ec2', 'create-launch-template',
+                    '--region', self.region,
+                    '--launch-template-name', f'pkb-eks-lt-{az}',
+                    '--launch-template-data', lt_data,
+                ],
+                raise_on_failure=False,
+            )
+            if lt_rc == 0:
+              logging.info(
+                  '[EKS] Created launch template pkb-eks-lt-%s (AMI=%s) -> %s',
+                  az, ami_id, res_id,
+              )
+            else:
+              logging.warning('[EKS] Failed to create launch template for %s', az)
+          else:
+            logging.warning('[EKS] Missing AMI/CA/endpoint — no launch template for %s', az)
+        else:
+          logging.warning('[EKS] Failed to create capacity reservation in %s — on-demand', az)
+
     # Above create command passes "withOidc=true", but it doesn't seem to work &
     # therefore this command is needed.
-    cmd = [
-        FLAGS.eksctl,
-        'utils',
-        'associate-iam-oidc-provider',
-        f'--cluster={self.name}',
-        f'--region={self.region}',
-        '--approve',
-    ]
-    vm_util.IssueCommand(cmd)
+    if not FLAGS.eks_skip_ebs_csi:
+      cmd = [
+          FLAGS.eksctl,
+          'utils',
+          'associate-iam-oidc-provider',
+          f'--cluster={self.name}',
+          f'--region={self.region}',
+          '--approve',
+      ]
+      vm_util.IssueCommand(cmd)
 
     # EBS CSI driver is required for creating EBS volumes in version > 1.23
     # https://docs.aws.amazon.com/eks/latest/userguide/ebs-csi.html
+    # Skip if --eks_skip_ebs_csi is set (saves ~3 min for benchmarks that
+    # do not use persistent volumes, such as kubernetes_management).
+    if FLAGS.eks_skip_ebs_csi:
+      logging.info(
+          '[EKS] Skipping EBS CSI driver setup (--eks_skip_ebs_csi=True). '
+          + 'Saves ~3 min. Set to False if benchmark needs persistent volumes.'
+      )
+    else:
+      # Name must be unique.
+      ebs_csi_driver_role = f'AmazonEKS_EBS_CSI_DriverRole_{self.name}'
+
+      ebs_policy_arn = (
+          'arn:aws:iam::aws:policy/service-role/'
+          + 'AmazonEBSCSIDriverPolicy')
+      cmd = [
+          FLAGS.eksctl,
+          'create',
+          'iamserviceaccount',
+          '--name=ebs-csi-controller-sa',
+          '--namespace=kube-system',
+          f'--region={self.region}',
+          f'--cluster={self.name}',
+          f'--attach-policy-arn={ebs_policy_arn}',
+          '--approve',
+          '--role-only',
+          f'--role-name={ebs_csi_driver_role}',
+      ]
+      vm_util.IssueCommand(cmd)
 
-    # Name must be unique.
-    ebs_csi_driver_role = f'AmazonEKS_EBS_CSI_DriverRole_{self.name}'
-
-    cmd = [
-        FLAGS.eksctl,
-        'create',
-        'iamserviceaccount',
-        '--name=ebs-csi-controller-sa',
-        '--namespace=kube-system',
-        f'--region={self.region}',
-        f'--cluster={self.name}',
-        '--attach-policy-arn=arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy',
-        '--approve',
-        '--role-only',
-        f'--role-name={ebs_csi_driver_role}',
-    ]
-    vm_util.IssueCommand(cmd)
-
-    cmd = [
-        FLAGS.eksctl,
-        'create',
-        'addon',
-        '--name=aws-ebs-csi-driver',
-        f'--region={self.region}',
-        f'--cluster={self.name}',
-        f'--service-account-role-arn=arn:aws:iam::{self.account}:role/{ebs_csi_driver_role}',
-    ]
-    vm_util.IssueCommand(cmd)
+      svc_acct_arn = (
+          f'arn:aws:iam::{self.account}:role/{ebs_csi_driver_role}')
+      cmd = [
+          FLAGS.eksctl,
+          'create',
+          'addon',
+          '--name=aws-ebs-csi-driver',
+          f'--region={self.region}',
+          f'--cluster={self.name}',
+          f'--service-account-role-arn={svc_acct_arn}',
+      ]
+      vm_util.IssueCommand(cmd)
 
     if aws_flags.AWS_EKS_POD_IDENTITY_ROLE.value:
       cmd = util.AWS_PREFIX + [
@@ -526,6 +782,703 @@ def ResizeNodePool(
     ]
     vm_util.IssueCommand(cmd)
 
+  def CreateNodePool(
+      self,
+      nodepool_config: container.BaseNodePoolConfig,
+      node_version: str | None = None,
+  ) -> None:
+    """Adds one managed node group to the cluster with `eksctl`.
+
+    Raises:
+      errors.Resource.CreationError: if eksctl exits with a non-zero code.
+    """
+    nodegroup = self._RenderNodeGroupJson(nodepool_config)
+    if node_version:
+      nodegroup['version'] = node_version
+    metadata = {'name': self.name, 'region': self.region}
+    cluster_config = {
+        'apiVersion': 'eksctl.io/v1alpha5',
+        'kind': 'ClusterConfig',
+        'metadata': metadata,
+        'managedNodeGroups': [nodegroup],
+    }
+    config_path = self._WriteJsonToFile(cluster_config)
+    create_cmd = [FLAGS.eksctl, 'create', 'nodegroup']
+    create_cmd.append(f'--config-file={config_path}')
+    # 1800 s = 30 minutes; failures are turned into CreationError below.
+    result = vm_util.IssueCommand(
+        create_cmd, timeout=1800, raise_on_failure=False
+    )
+    _, stderr, retcode = result
+    if retcode:
+      raise errors.Resource.CreationError(stderr)
+
+  def DeleteNodePool(self, name: str) -> None:
+    """Removes node group `name` via `eksctl delete nodegroup --wait`.
+
+    Args:
+      name: Name of the node group to delete.
+    """
+    delete_cmd = [FLAGS.eksctl, 'delete', 'nodegroup']
+    delete_cmd += [f'--name={name}', f'--cluster={self.name}']
+    delete_cmd += [f'--region={self.region}', '--wait']
+    # No raise_on_failure override here, so IssueCommand's default applies.
+    # 1800 s = 30 minutes.
+    vm_util.IssueCommand(delete_cmd, timeout=1800)
+
+  def UpgradeNodePool(self, name: str, target_version: str) -> None:
+    """Runs `eksctl upgrade nodegroup --wait` for node group `name`.
+
+    Args:
+      name: Name of the node group to upgrade.
+      target_version: Value passed as --kubernetes-version.
+    """
+    upgrade_cmd = [FLAGS.eksctl, 'upgrade', 'nodegroup']
+    upgrade_cmd += [f'--name={name}', f'--cluster={self.name}']
+    upgrade_cmd.append(f'--region={self.region}')
+    upgrade_cmd.append(f'--kubernetes-version={target_version}')
+    upgrade_cmd.append('--wait')
+    vm_util.IssueCommand(upgrade_cmd, timeout=1800)
+
+  # ---- Async variants (return opaque handles) -------------------------------
+
+  def _DiscoverSubnets(self) -> list[str]:
+    """Looks up the cluster's subnet IDs with `aws eks describe-cluster`.
+
+    A non-empty result is memoized on the instance; an empty or missing cache
+    triggers a fresh lookup on the next call.
+
+    Returns:
+      The subnet IDs from cluster.resourcesVpcConfig.subnetIds.
+    """
+    cached = getattr(self, '_cached_subnets', None)
+    if cached:
+      return cached
+    describe_args = ['eks', 'describe-cluster', '--name', self.name]
+    describe_args += ['--region', self.region]
+    stdout, _, _ = vm_util.IssueCommand(util.AWS_PREFIX + describe_args)
+    vpc_config = json.loads(stdout)['cluster']['resourcesVpcConfig']
+    self._cached_subnets = vpc_config['subnetIds']
+    return self._cached_subnets
+
+  def _DiscoverSubnetsPerAZ(self) -> dict[str, str]:
+    """Returns a mapping of {AZ: subnet_id} with one subnet per AZ.
+
+    Used by CreateNodePoolAsync to distribute nodegroups round-robin across
+    AZs, avoiding per-AZ EC2 capacity limits when creating many pools.
+    Covers every AZ the cluster has a subnet in (no control_plane_zones
+    filter). Returns {} if the lookup fails. Cached after first call.
+    """
+    if getattr(self, '_cached_subnets_per_az', None) is not None:
+      return self._cached_subnets_per_az
+
+    subnet_ids = self._DiscoverSubnets()
+    if not subnet_ids:
+      self._cached_subnets_per_az = {}
+      return {}
+
+    # Describe subnets to get their AZ mapping
+    out, _, rc = vm_util.IssueCommand(
+        util.AWS_PREFIX + [
+            'ec2', 'describe-subnets',
+            '--region', self.region,
+            '--subnet-ids', *subnet_ids,
+            '--query', 'Subnets[*].{SubnetId:SubnetId,AZ:AvailabilityZone,Public:MapPublicIpOnLaunch}',
+            '--output', 'json',
+        ],
+        raise_on_failure=False,
+    )
+    if rc:
+      logging.warning(
+          '[EKS] Could not describe subnets for AZ mapping — '
+          + 'falling back to all subnets'
+      )
+      self._cached_subnets_per_az = {}
+      return {}
+
+    subnets = json.loads(out)
+
+    # Do NOT filter by control_plane_zones — PKB truncates it to 2 AZs.
+    # Accept all subnets the VPC has across all AZs.
+    # Build AZ map — always prefer public subnets (MapPublicIpOnLaunch=True)
+    # which have an internet gateway route. Private subnets lack IGW routes
+    # and nodes launched there cannot reach the EKS API server to join.
+    az_map: dict[str, str] = {}
+    az_map_private: dict[str, str] = {}
+    for s in subnets:
+      az = s['AZ']
+      if s.get('Public'):
+        az_map[az] = s['SubnetId']
+        logging.info('[EKS] AZ %s → public subnet %s', az, s['SubnetId'])
+      elif az not in az_map:
+        az_map_private[az] = s['SubnetId']
+    for az, sid in az_map_private.items():
+      if az not in az_map:
+        logging.warning('[EKS] AZ %s has no public subnet — using private %s', az, sid)
+        az_map[az] = sid
+
+    logging.info(
+        '[EKS] Subnet-per-AZ mapping: %s (from %d total subnets)',
+        az_map, len(subnet_ids),
+    )
+    self._cached_subnets_per_az = az_map
+    return az_map
+
+  def _ResolveReleaseVersion(self, minor: str) -> str:
+    """Returns the EKS-optimized AMI release version (e.g. '1.33.10-20260124').
+
+    Populates `releaseVersion` in the create-nodegroup payload so the benchmark
+    can pin K8s minors. A lock serializes the SSM lookup and results are cached
+    per minor. NOTE(review): the lock itself is created lazily without
+    synchronization, so concurrent first calls may not share it — confirm.
+    """
+    if getattr(self, '_release_version_lock', None) is None:
+      self._release_version_lock = threading.Lock()
+    with self._release_version_lock:
+      cache = getattr(self, '_cached_release_versions', None) or {}
+      if minor in cache:
+        return cache[minor]
+      cmd = util.AWS_PREFIX + [
+          'ssm',
+          'get-parameter',
+          '--name',
+          (
+              f'/aws/service/eks/optimized-ami/{minor}/amazon-linux-2023/'
+              'x86_64/standard/recommended/release_version'
+          ),
+          '--region',
+          self.region,
+          '--query',
+          'Parameter.Value',
+          '--output',
+          'text',
+      ]
+      out, err, rc = vm_util.IssueCommand(cmd, raise_on_failure=False)
+      if rc:
+        raise errors.Resource.CreationError(
+            f'Failed to resolve EKS release version for minor {minor!r}: {err}'
+        )
+      cache[minor] = out.strip()
+      self._cached_release_versions = cache
+      return cache[minor]
+
+  def _DiscoverNodeRoleArn(self) -> str:
+    """Finds a node IAM role ARN by reusing one from an existing nodegroup.
+
+    Nodegroups are listed and described one at a time; the first one that
+    reports a `nodeRole` wins and the value is memoized on the instance.
+
+    Returns:
+      The node role ARN.
+
+    Raises:
+      errors.Resource.CreationError: if no nodegroup reports a node role.
+    """
+    known_role = getattr(self, '_cached_node_role_arn', None)
+    if known_role:
+      return known_role
+    cluster_args = ['--cluster-name', self.name]
+    region_args = ['--region', self.region]
+    list_cmd = util.AWS_PREFIX + ['eks', 'list-nodegroups']
+    listing, _, _ = vm_util.IssueCommand(
+        list_cmd + cluster_args + region_args
+    )
+    describe_cmd = util.AWS_PREFIX + ['eks', 'describe-nodegroup']
+    for nodegroup_name in json.loads(listing).get('nodegroups', []):
+      described, _, _ = vm_util.IssueCommand(
+          describe_cmd
+          + cluster_args
+          + ['--nodegroup-name', nodegroup_name]
+          + region_args
+      )
+      node_role = json.loads(described)['nodegroup'].get('nodeRole')
+      if node_role:
+        self._cached_node_role_arn = node_role
+        return node_role
+    raise errors.Resource.CreationError(
+        f'No existing nodegroup found to discover node role for '
+        f'cluster {self.name}.'
+    )
+
+  def CreateNodePoolAsync(
+      self,
+      nodepool_config: container.BaseNodePoolConfig,
+      node_version: str | None = None,
+  ) -> str:
+    """Issues `aws eks create-nodegroup` and returns an `ng_active:` handle."""
+    # The request is passed via --cli-input-json so that both `version`
+    # (e.g. "1.33") and `releaseVersion` (e.g. "1.33.11-...") go in one call:
+    #   1. AWS CLI v1 has a bug where the top-level --version flag swallows
+    #      the subcommand --version, printing the CLI banner and exiting.
+    #      cli-input-json sidesteps CLI argument parsing entirely.
+    #   2. EKS rejects a releaseVersion that doesn't match the request's
+    #      `version`; if `version` is omitted EKS defaults it to the
+    #      cluster's version, which (for the N-1 -> N benchmark path)
+    #      produces a "release version X is not valid for kubernetes
+    #      version Y" error.
+
+    # ── AZ distribution ────────────────────────────────────────────────────
+    # When multiple zones are specified (e.g. us-east-1a,1b,1c), distribute
+    # nodegroups round-robin across AZs to avoid per-AZ EC2 capacity limits.
+    # Without this, EKS places all nodegroups in a single AZ causing timeouts.
+    # Pool name format: pkbma000, pkbma001, ... — extract index from suffix.
+    az_subnets = self._DiscoverSubnetsPerAZ()
+    if az_subnets and len(az_subnets) > 1:
+      # Collect every digit in the pool name (not only a trailing suffix)
+      name = nodepool_config.name
+      suffix = ''.join(c for c in name if c.isdigit())
+      # Names without digits (e.g. pkbmb, Scenario B) get idx=1, i.e. the
+      # second sorted AZ; presumably the first hosts the default nodegroup.
+      idx = int(suffix) if suffix else 1
+      zones = sorted(az_subnets.keys())
+      assigned_az = zones[idx % len(zones)]
+      subnets = [az_subnets[assigned_az]]
+      logging.info(
+          '[EKS] CreateNodePool %s -> AZ=%s subnet=%s (round-robin idx=%d)',
+          name, assigned_az, subnets[0], idx,
+      )
+    else:
+      subnets = self._DiscoverSubnets()
+      logging.info('[EKS] CreateNodePool %s -> using all subnets (single AZ)',
+                   nodepool_config.name)
+
+    payload: dict[str, Any] = {
+        'clusterName': self.name,
+        'nodegroupName': nodepool_config.name,
+        'scalingConfig': {
+            'minSize': nodepool_config.num_nodes,
+            'maxSize': nodepool_config.num_nodes,
+            'desiredSize': nodepool_config.num_nodes,
+        },
+        'subnets': subnets,
+        'instanceTypes': [nodepool_config.machine_type],
+        'amiType': 'AL2023_x86_64_STANDARD',
+        'nodeRole': self._DiscoverNodeRoleArn(),
+        'labels': {'pkb_nodepool': nodepool_config.name},
+        'tags': util.MakeDefaultTags(),
+        # Intended to target open capacity reservations before falling back
+        # to regular on-demand. NOTE(review): confirm that the EKS
+        # CreateNodegroup API accepts this key in --cli-input-json.
+        'capacityReservationSpecification': {
+            'capacityReservationPreference': 'open',
+        },
+    }
+    _az = assigned_az if az_subnets and len(az_subnets) > 1 else f'{self.region}a'
+    # Only look up launch templates and capacity reservations when
+    # --eks_reserve_capacity_per_az=true. Other benchmarks skip this entirely.
+    if FLAGS.eks_reserve_capacity_per_az:
+      _lt_name = f'pkb-eks-lt-{_az}'
+      _lt_out, _, _lt_rc = vm_util.IssueCommand(
+          util.AWS_PREFIX + [
+              'ec2', 'describe-launch-templates',
+              '--region', self.region,
+              '--filters', f'Name=launch-template-name,Values={_lt_name}',
+              '--query', 'LaunchTemplates[0].LaunchTemplateId',
+              '--output', 'text',
+          ],
+          raise_on_failure=False,
+      )
+      res_id = self._capacity_reservation_ids.get(_az, '')
+      if res_id and _lt_rc == 0 and _lt_out.strip() and _lt_out.strip() not in ('None', 'null', ''):
+        payload['launchTemplate'] = {'id': _lt_out.strip(), 'version': '$Latest'}
+        # When launch template specifies an ImageId, EKS rejects these fields:
+        # - releaseVersion: conflicts with AMI
+        # - instanceTypes:  must come from launch template only
+        # - amiType:        conflicts with AMI
+        payload.pop('releaseVersion', None)
+        payload.pop('instanceTypes', None)
+        payload.pop('amiType', None)
+        logging.info(
+            '[EKS] Nodegroup %s using launch template %s targeting reservation %s in AZ %s',
+            nodepool_config.name, _lt_name, res_id, _az,
+        )
+      else:
+        logging.warning('[EKS] No reservation/template for AZ %s — using on-demand', _az)
+
+    if node_version:
+      # EKS rejects both 'version' and 'releaseVersion' when a launch template
+      # with ImageId is specified — skip both when launchTemplate is in use.
+      if 'launchTemplate' not in payload:
+        payload['version'] = node_version
+        payload['releaseVersion'] = self._ResolveReleaseVersion(node_version)
+    filename = self._WriteJsonToFile(payload)
+    cmd = util.AWS_PREFIX + [
+        'eks',
+        'create-nodegroup',
+        '--region',
+        self.region,
+        '--cli-input-json',
+        f'file://{filename}',
+    ]
+    # Retry on EC2 RunInstances throttling at high concurrency (99 pools).
+    max_retries = 5
+    base_delay = 10
+    for attempt in range(max_retries):
+      _, stderr, retcode = vm_util.IssueCommand(
+          cmd, timeout=300, raise_on_failure=False
+      )
+      if retcode == 0:
+        break
+      if 'Request limit exceeded' in stderr or 'ThrottlingException' in stderr:
+        if attempt < max_retries - 1:
+          delay = base_delay * (2 ** attempt)
+          logging.warning(
+              '[EKS] CreateNodegroup %s throttled — retry %d/%d in %ds',
+              nodepool_config.name, attempt + 1, max_retries, delay,
+          )
+          time.sleep(delay)
+          continue
+      raise errors.Resource.CreationError(stderr)
+    else:
+      raise errors.Resource.CreationError(
+          f'CreateNodegroup {nodepool_config.name} failed after retries: {stderr}'
+      )
+    return f'ng_active:{nodepool_config.name}'
+
+  def UpgradeNodePoolAsync(self, name: str, target_version: str) -> str:
+    """Starts a nodegroup version update; returns handle 'ng_active:<name>'.
+
+    When --eks_reserve_capacity_per_az is set and a launch template named
+    'pkb-eks-lt-<az>' is found, the update passes that template instead of
+    --kubernetes-version.
+    """
+    # The digits in the nodegroup name pick the AZ. pkbmb (Scenario B) has no
+    # digits, so it gets index 1 (us-east-1b) to avoid competing with
+    # us-east-1a, which already has the default nodegroup.
+    digits = ''.join(ch for ch in name if ch.isdigit())
+    zone_index = int(digits) if digits else 1
+    subnets_by_az = self._DiscoverSubnetsPerAZ()
+    if subnets_by_az and len(subnets_by_az) > 1:
+      ordered_zones = sorted(subnets_by_az.keys())
+      zone = ordered_zones[zone_index % len(ordered_zones)]
+    else:
+      zone = f'{self.region}a'
+    template_name = ''
+    template_id = ''
+    # Launch templates are only looked up when capacity reservations are
+    # enabled; other benchmarks always use the --kubernetes-version upgrade.
+    if FLAGS.eks_reserve_capacity_per_az:
+      template_name = f'pkb-eks-lt-{zone}'
+      lookup = util.AWS_PREFIX + [
+          'ec2', 'describe-launch-templates',
+          '--region', self.region,
+          '--filters', f'Name=launch-template-name,Values={template_name}',
+          '--query', 'LaunchTemplates[0].LaunchTemplateId',
+          '--output', 'text',
+      ]
+      lookup_out, _, lookup_rc = vm_util.IssueCommand(
+          lookup, raise_on_failure=False
+      )
+      found = lookup_out.strip()
+      if lookup_rc == 0 and found not in ('', 'None', 'null'):
+        template_id = found
+
+    cmd = util.AWS_PREFIX + [
+        'eks', 'update-nodegroup-version',
+        '--cluster-name', self.name,
+        '--nodegroup-name', name,
+        '--region', self.region,
+    ]
+    if template_id:
+      # Custom AMI nodegroups cannot take --kubernetes-version.
+      cmd += ['--launch-template', f'id={template_id},version=$Latest']
+      logging.info('[EKS] Upgrading %s with launch template %s in AZ %s',
+                   name, template_name, zone)
+    else:
+      cmd += ['--kubernetes-version', target_version]
+    _, stderr, retcode = vm_util.IssueCommand(
+        cmd, timeout=300, raise_on_failure=False
+    )
+    if retcode:
+      raise errors.Resource.CreationError(stderr)
+    return f'ng_active:{name}'
+
+  def DeleteNodePoolAsync(self, name: str) -> str:
+    """Starts a nodegroup delete; returns handle 'ng_gone:<name>'."""
+    cmd = util.AWS_PREFIX + [
+        'eks', 'delete-nodegroup',
+        '--cluster-name', self.name,
+        '--nodegroup-name', name,
+        '--region', self.region,
+    ]
+    _, stderr, retcode = vm_util.IssueCommand(
+        cmd, timeout=300, raise_on_failure=False
+    )
+    # A nodegroup that is already absent is the desired end state of a
+    # delete (WaitForOperation accepts the same error as "gone"), so only
+    # other failures are surfaced.
+    if retcode and 'ResourceNotFoundException' not in (stderr or ''):
+      raise errors.Resource.CreationError(stderr)
+    return f'ng_gone:{name}'
+
+  def UpdateClusterAsync(self) -> str:
+    """Starts a cluster logging toggle; returns 'cluster_update:<id>'.
+
+    The handle embeds the update id so that WaitForOperation can poll that
+    update's own status (Successful / Failed). The cluster's top-level
+    status stays ACTIVE during config updates, so polling it would return
+    immediately and silently mis-report latency.
+
+    Returns:
+      The handle 'cluster_update:<update id>'.
+
+    Raises:
+      errors.Resource.CreationError: update-cluster-config did not succeed.
+    """
+    cluster_args = ['--name', self.name, '--region', self.region]
+    described, _, _ = vm_util.IssueCommand(
+        util.AWS_PREFIX + ['eks', 'describe-cluster'] + cluster_args
+    )
+    cluster_info = json.loads(described)['cluster']
+    log_setup = cluster_info.get('logging', {}).get('clusterLogging', [])
+    # Flip the state: enable every type if none is on, else disable them.
+    turn_on = not any(item.get('enabled', False) for item in log_setup)
+    toggle = {
+        'clusterLogging': [{
+            'types': [
+                'api', 'audit', 'authenticator', 'controllerManager',
+                'scheduler',
+            ],
+            'enabled': turn_on,
+        }]
+    }
+    update_cmd = (
+        util.AWS_PREFIX
+        + ['eks', 'update-cluster-config']
+        + cluster_args
+        + ['--logging', json.dumps(toggle)]
+    )
+    # Wait for cluster ACTIVE before firing the update: at 99-pool scale
+    # Scenario A leaves the cluster UPDATING, which causes
+    # ResourceInUseException.
+    logging.info('[EKS] Waiting for cluster ACTIVE before ClusterUpdate...')
+    status_cmd = (
+        util.AWS_PREFIX
+        + ['eks', 'describe-cluster']
+        + cluster_args
+        + ['--query', 'cluster.status', '--output', 'text']
+    )
+    polls_left = 60
+    while polls_left:
+      polls_left -= 1
+      status_text, _, status_rc = vm_util.IssueCommand(
+          status_cmd, raise_on_failure=False
+      )
+      if status_rc == 0 and status_text.strip() == 'ACTIVE':
+        logging.info('[EKS] Cluster is ACTIVE — proceeding with ClusterUpdate')
+        break
+      logging.info('[EKS] Cluster status=%s — waiting 5s...', status_text.strip())
+      time.sleep(5)
+    # ResourceInUseException is a race with other in-flight operations;
+    # retry it with a linearly growing delay (30s, 60s, ...).
+    total_tries = 10
+    for try_no in range(1, total_tries + 1):
+      stdout, stderr, retcode = vm_util.IssueCommand(
+          update_cmd, timeout=300, raise_on_failure=False
+      )
+      if retcode == 0:
+        break
+      if 'ResourceInUseException' not in stderr or try_no == total_tries:
+        raise errors.Resource.CreationError(stderr)
+      pause = 30 * try_no
+      logging.warning(
+          '[EKS] UpdateClusterConfig ResourceInUseException — retry %d/%d in %ds',
+          try_no, total_tries, pause,
+      )
+      time.sleep(pause)
+    update_id = json.loads(stdout)['update']['id']
+    return f'cluster_update:{update_id}'
+
+  def ResolveNodePoolVersions(self) -> tuple[str, str]:
+    """Returns (initial, target) nodegroup versions as 'major.minor'.
+
+    The cluster version is read from self.cluster_version, falling back to
+    self.k8s_version, which avoids a kubectl round-trip. A leading 'v' and
+    any patch component are dropped, e.g. '1.34.7' -> '1.34'.
+    """
+    cluster_ver = self.cluster_version or self.k8s_version
+    pieces = cluster_ver.lstrip('v').split('.')
+    major = int(pieces[0])
+    minor = int(pieces[1])
+    # target is the cluster's own minor version (N); initial is the
+    # adjacent minor below it (N-1).
+    versions = (f'{major}.{minor - 1}', f'{major}.{minor}')
+    logging.info(
+        '[EKS] ResolveNodePoolVersions: cluster=%s initial=%s target=%s',
+        cluster_ver, *versions,
+    )
+    return versions
+
+  def WaitForOperation(self, op_handle: str) -> None:
+    """Blocks until the operation named by op_handle reaches a final state.
+
+    Each waiter retries with poll_interval=5 and timeout=3600 on its own
+    retryable exception type.
+
+    Args:
+      op_handle: '<kind>:<name>' handle. kind 'ng_active' waits for the
+        nodegroup <name> to report ACTIVE, 'ng_gone' waits until
+        describe-nodegroup fails with ResourceNotFoundException, and
+        'cluster_update' waits for update id <name> to be Successful.
+
+    Raises:
+      ValueError: op_handle has an unknown kind.
+      errors.Resource.CreationError: the nodegroup ended in CREATE_FAILED,
+        DELETE_FAILED or DEGRADED, or the cluster update ended Failed or
+        Cancelled.
+    """
+    kind, _, name = op_handle.partition(':')
+
+    def _describe_nodegroup():
+      """Runs describe-nodegroup; returns (stdout, stderr, retcode)."""
+      return vm_util.IssueCommand(
+          util.AWS_PREFIX
+          + ['eks', 'describe-nodegroup']
+          + ['--cluster-name', self.name]
+          + ['--nodegroup-name', name]
+          + ['--region', self.region],
+          raise_on_failure=False,
+      )
+
+    @vm_util.Retry(
+        poll_interval=5,
+        fuzz=0,
+        timeout=3600,
+        retryable_exceptions=(errors.Resource.RetryableCreationError,),
+    )
+    def _wait_ng_active():
+      """Succeeds once the nodegroup status is ACTIVE."""
+      out, err, rc = _describe_nodegroup()
+      if rc:
+        raise errors.Resource.RetryableCreationError(err)
+      status = json.loads(out)['nodegroup']['status']
+      if status == 'ACTIVE':
+        return
+      if status in ('CREATE_FAILED', 'DELETE_FAILED', 'DEGRADED'):
+        raise errors.Resource.CreationError(
+            f'nodegroup {name} ended in {status}'
+        )
+      # Any other status is treated as still in progress.
+      raise errors.Resource.RetryableCreationError(
+          f'nodegroup {name} status={status}'
+      )
+
+    @vm_util.Retry(
+        poll_interval=5,
+        fuzz=0,
+        timeout=3600,
+        retryable_exceptions=(errors.Resource.RetryableDeletionError,),
+    )
+    def _wait_ng_gone():
+      """Succeeds once describe-nodegroup reports the nodegroup missing."""
+      _, err, rc = _describe_nodegroup()
+      if not rc:
+        raise errors.Resource.RetryableDeletionError(
+            f'nodegroup {name} still present'
+        )
+      # Only the not-found error proves the nodegroup is gone; any other
+      # failure is polled again.
+      if 'ResourceNotFoundException' not in (err or ''):
+        raise errors.Resource.RetryableDeletionError(err)
+
+    @vm_util.Retry(
+        poll_interval=5,
+        fuzz=0,
+        timeout=3600,
+        retryable_exceptions=(errors.Resource.RetryableCreationError,),
+    )
+    def _wait_cluster_update():
+      """Succeeds once the update identified by name is Successful."""
+      out, err, rc = vm_util.IssueCommand(
+          util.AWS_PREFIX
+          + ['eks', 'describe-update']
+          + ['--name', self.name]
+          + ['--update-id', name]
+          + ['--region', self.region]
+          + ['--query', 'update.status']
+          + ['--output', 'text'],
+          raise_on_failure=False,
+      )
+      if rc:
+        raise errors.Resource.RetryableCreationError(err)
+      status = out.strip()
+      if status == 'Successful':
+        return
+      if status in ('Failed', 'Cancelled'):
+        raise errors.Resource.CreationError(
+            f'cluster update {name} ended in {status}'
+        )
+      # Any other status is treated as still in progress.
+      raise errors.Resource.RetryableCreationError(
+          f'cluster update {name} status={status}'
+      )
+
+    # Dispatch on the handle kind.
+    waiters = {
+        'ng_active': _wait_ng_active,
+        'ng_gone': _wait_ng_gone,
+        'cluster_update': _wait_cluster_update,
+    }
+    if kind not in waiters:
+      raise ValueError(f'Unknown EKS op handle: {op_handle!r}')
+    waiters[kind]()
+
+  def UpdateCluster(self) -> None:
+    """Runs a real cluster-level update by toggling control-plane logging.
+
+    Reads the current cluster logging state, flips it for all five log
+    types and blocks until that update finishes.
+
+    Completion is tracked with describe-update on the id returned by
+    update-cluster-config instead of cluster.status, which can still read
+    ACTIVE right after the call and cannot reveal a failed update.
+
+    Raises:
+      errors.Resource.CreationError: the update ended Failed or Cancelled.
+    """
+    target = ['--name', self.name, '--region', self.region]
+    stdout, _, _ = vm_util.IssueCommand(
+        util.AWS_PREFIX + ['eks', 'describe-cluster'] + target
+    )
+    cluster = json.loads(stdout)['cluster']
+    current = cluster.get('logging', {}).get('clusterLogging', [])
+    new_enabled = not any(entry.get('enabled', False) for entry in current)
+    logging_payload = json.dumps({
+        'clusterLogging': [{
+            'types': ['api', 'audit', 'authenticator', 'controllerManager',
+                      'scheduler'],
+            'enabled': new_enabled,
+        }]
+    })
+    update_cmd = util.AWS_PREFIX + ['eks', 'update-cluster-config'] + target
+    stdout, _, _ = vm_util.IssueCommand(
+        update_cmd + ['--logging', logging_payload], timeout=900
+    )
+    update_id = json.loads(stdout)['update']['id']
+
+    @vm_util.Retry(
+        poll_interval=5, fuzz=0, timeout=900,
+        retryable_exceptions=(errors.Resource.RetryableCreationError,),
+    )
+    def _wait_update_done():
+      out, _, _ = vm_util.IssueCommand(
+          util.AWS_PREFIX
+          + ['eks', 'describe-update', '--update-id', update_id]
+          + target
+          + ['--query', 'update.status', '--output', 'text']
+      )
+      status = out.strip()
+      if status in ('Failed', 'Cancelled'):
+        raise errors.Resource.CreationError(
+            f'cluster update {update_id} ended in {status}'
+        )
+      if status != 'Successful':
+        raise errors.Resource.RetryableCreationError(
+            f'cluster update {update_id} status={status}'
+        )
+
+    _wait_update_done()
+
 
 class EksAutoCluster(BaseEksCluster):
   """Class representing an Elastic Kubernetes Service cluster with auto mode.
@@ -542,7 +1495,7 @@ class EksAutoCluster(BaseEksCluster):
   def __init__(self, spec):
     super().__init__(spec)
     self._ChooseSecondZone()
-    is_rare_gpu = self.gpu_type in _RARE_GPU_TYPES
+    is_rare_gpu = virtual_machine.GPU_TYPE.value in _RARE_GPU_TYPES
     self.use_spot: bool = aws_flags.USE_AWS_SPOT_INSTANCES.value or is_rare_gpu
 
   def _Create(self):
@@ -574,6 +1527,34 @@ def _PostCreate(self):
 
   def _Delete(self):
     """Deletes the control plane and worker nodes."""
+    # Clean up SSH key pair — safety net in case _DeleteDependencies didn't run
+    try:
+      aws_virtual_machine.AwsKeyFileManager.DeleteKeyfile(self.region)
+    except Exception:  # pylint: disable=broad-except
+      pass
+    # Clean up dynamically created launch templates and capacity reservations
+    # Only runs if capacity reservations were actually created this run.
+    if getattr(FLAGS, 'eks_reserve_capacity_per_az', False):
+      for az in getattr(self, '_capacity_reservation_ids', {}).keys():
+        vm_util.IssueCommand(
+            util.AWS_PREFIX + [
+                'ec2', 'delete-launch-template',
+                '--launch-template-name', f'pkb-eks-lt-{az}',
+                '--region', self.region,
+            ],
+            raise_on_failure=False,
+        )
+        logging.info('[EKS] Deleted launch template pkb-eks-lt-%s', az)
+      for az, res_id in getattr(self, '_capacity_reservation_ids', {}).items():
+        vm_util.IssueCommand(
+            util.AWS_PREFIX + [
+                'ec2', 'cancel-capacity-reservation',
+                '--capacity-reservation-id', res_id,
+                '--region', self.region,
+            ],
+            raise_on_failure=False,
+        )
+        logging.info('[EKS] Cancelled capacity reservation %s in %s', res_id, az)
     super()._Delete()
     cmd = [
         FLAGS.eksctl,
@@ -607,14 +1588,15 @@ def ResizeNodePool(
   def GetNodeSelectors(self, machine_type: str | None = None) -> dict[str, str]:
     """Get the node selectors section of a yaml for the provider."""
     del machine_type  # Unused.
-    # Theoretically needed in mixed mode, but deployments fail without it:
-    # https://docs.aws.amazon.com/eks/latest/userguide/associate-workload.html#_require_a_workload_is_deployed_to_eks_auto_mode_nodes
+    # Theoretically needed in mixed mode, but deployments fail without it.
+    # See: docs.aws.amazon.com/eks/latest/userguide/associate-workload.html
+    # #_require_a_workload_is_deployed_to_eks_auto_mode_nodes
     selectors = {'eks.amazonaws.com/compute-type': 'auto'}
     if self.use_spot:
       selectors['karpenter.sh/capacity-type'] = 'spot'
-    if self.gpu_type:
+    if virtual_machine.GPU_TYPE.value:
       selectors['eks.amazonaws.com/instance-gpu-name'] = (
-          self.gpu_type
+          virtual_machine.GPU_TYPE.value
       )
     return selectors
 
@@ -646,10 +1628,15 @@ def __init__(self, spec):
   def _Create(self):
     """Creates the control plane and worker nodes."""
     template_filename = vm_util.PrependTempDir('cloud-formation-template.yaml')
+    cfn_url = (
+        'https://raw.githubusercontent.com/aws/karpenter-provider-aws/'
+        + f'v{_KARPENTER_VERSION}/website/content/en/preview/'
+        + 'getting-started/getting-started-with-karpenter/'
+        + 'cloudformation.yaml')
     vm_util.IssueCommand([
         'curl',
         '-fsSL',
-        f'https://raw.githubusercontent.com/aws/karpenter-provider-aws/v{_KARPENTER_VERSION}/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml',
+        cfn_url,
         '-o',
         template_filename,
     ])
@@ -681,6 +1668,12 @@ def _Create(self):
     bootstrapping_nodepool.min_nodes = 1
     bootstrapping_nodepool.max_nodes = 1
     bootstrapping_nodepool.machine_type = 'm7i.2xlarge'
+    karpenter_policy_arn = (
+        f'arn:aws:iam::{self.account}:policy/'
+        + f'KarpenterControllerPolicy-{self.name}')
+    karpenter_node_role_arn = (
+        f'arn:aws:iam::{self.account}:role/'
+        + f'KarpenterNodeRole-{self.name}')
     create_json: dict[str, Any] = {
         'metadata': {
             'tags': {'karpenter.sh/discovery': self.name},
@@ -691,14 +1684,12 @@ def _Create(self):
                 'serviceAccountName': 'karpenter',
                 'roleName': f'{self.name}-karpenter',
                 'permissionPolicyARNs': [
-                    f'arn:aws:iam::{self.account}:policy/KarpenterControllerPolicy-{self.name}'
+                    karpenter_policy_arn
                 ],
             }],
         },
         'iamIdentityMappings': [{
-            'arn': (
-                f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}'
-            ),
+            'arn': karpenter_node_role_arn,
             'username': 'system:node:{{EC2PrivateDNSName}}',
             'groups': ['system:bootstrappers', 'system:nodes'],
         }],
@@ -739,15 +1730,16 @@ def _InstallAwsLoadBalancerController(self) -> None:
     policy_arn = (stdout or '').strip()
     if not policy_arn or policy_arn == 'None':
       with vm_util.NamedTemporaryFile(dir=vm_util.GetTempDir(), mode='w') as tf:
+        alb_policy_url = (
+            'https://raw.githubusercontent.com/kubernetes-sigs/'
+            + 'aws-load-balancer-controller/'
+            + 'v2.13.4/docs/install/iam_policy.json')
         vm_util.IssueCommand([
             'curl',
             '-sSL',
             '-o',
             tf.name,
-            (
-                'https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/'
-                'v2.13.4/docs/install/iam_policy.json'
-            ),
+            alb_policy_url,
         ])
         stdout, _, _ = vm_util.IssueCommand(
             util.AWS_PREFIX
@@ -788,11 +1780,14 @@ def _InstallAwsLoadBalancerController(self) -> None:
         in stderr,
     )
     # 4) Apply CRDs
+    crds_url = (
+        'https://raw.githubusercontent.com/aws/eks-charts/master/'
+        + 'stable/aws-load-balancer-controller/crds/crds.yaml')
     kubectl.RunKubectlCommand(
         [
             'apply',
             '-f',
-            'https://raw.githubusercontent.com/aws/eks-charts/master/stable/aws-load-balancer-controller/crds/crds.yaml',
+            crds_url,
         ],
         suppress_failure=lambda stdout, stderr, retcode: 'already exists'
         in stderr,
@@ -883,7 +1878,8 @@ def _WaitForIngress(self, name: str, namespace: str, port: int) -> str:
   def _PostIngressNetworkingFixups(
       self, namespace: str, name: str, port: int, address: str
   ) -> None:
-    """Fixs ALB -> nodes connectivity to prevent 504 errors from unhealthy targets."""
+    """Fixes ALB -> node connectivity to prevent 504 errors."""
+    del namespace, name  # Unused
 
     # 1) Get ALB security group from address
     host = (
@@ -1008,7 +2004,7 @@ def _PostCreate(self):
               'daemonset/aws-node',
               '-n',
               'kube-system',
-              '--timeout=%ds' % vm_util.DEFAULT_TIMEOUT,
+              f'--timeout={vm_util.DEFAULT_TIMEOUT}s',
           ],
           timeout=vm_util.DEFAULT_TIMEOUT,
       )
@@ -1093,12 +2089,15 @@ def _PostCreate(self):
     # Get the AMI version for current kubernetes version.
     # See e.g. https://karpenter.sh/docs/tasks/managing-amis/ for not using
     # @latest.
+    ssm_ami_path = (
+        f'/aws/service/eks/optimized-ami/{self.cluster_version}/'
+        + 'amazon-linux-2023/x86_64/standard/recommended/image_id')
     image_id, _, _ = vm_util.IssueCommand([
         'aws',
         'ssm',
         'get-parameter',
         '--name',
-        f'/aws/service/eks/optimized-ami/{self.cluster_version}/amazon-linux-2023/x86_64/standard/recommended/image_id',
+        ssm_ami_path,
         '--region',
         self.region,
         '--query',
@@ -1219,7 +2218,7 @@ def _DeleteDependencies(self):
     else:
       logging.info(
           'Karpenter node role %s not found or empty response; skipping'
-          ' instance profile cleanup',
+          + ' instance profile cleanup',
           node_role,
       )
       profiles_json = {'InstanceProfiles': []}
@@ -1371,7 +2370,7 @@ def _CleanupKarpenter(self):
       for eni_id in eni_ids:
         # Bind eni_id by default to avoid loop closure issues if
         # this is refactored.
-        def _DeleteOneEni(eni_id=eni_id) -> None:
+        def _delete_one_eni(eni_id=eni_id) -> None:
           _, stderr, retcode = vm_util.IssueCommand(
               [
                   'aws',
@@ -1402,7 +2401,7 @@ def _DeleteOneEni(eni_id=eni_id) -> None:
             poll_interval=10,
             max_retries=5,
             retryable_exceptions=(errors.Resource.RetryableDeletionError,),
-        )(_DeleteOneEni)()
+        )(_delete_one_eni)()
 
   def _IsReady(self):
     """Returns True if cluster is running. Autopilot defaults to 0 nodes."""
diff --git a/perfkitbenchmarker/providers/aws/flags.py b/perfkitbenchmarker/providers/aws/flags.py
index 6871a085e5..b7f6ca214c 100644
--- a/perfkitbenchmarker/providers/aws/flags.py
+++ b/perfkitbenchmarker/providers/aws/flags.py
@@ -376,3 +376,22 @@ def _ValidatePreprovisionedDataAccess(flag_values: dict[str, Any]) -> bool:
     None,
     'If supplied, creates the DocumentDB instance from the snapshot.',
 )
+
+# EKS tuning flags for the k8s_management benchmark: per-AZ capacity
+# reservations (with matching launch templates), and skipping the EBS CSI
+# driver setup, which saves ~3 minutes when no persistent volumes are used.
+flags.DEFINE_boolean(
+    'eks_reserve_capacity_per_az',
+    False,
+    'If True, dynamically creates EC2 capacity reservations and launch '
+    'templates per AZ before nodegroup creation. Enable only for the '
+    'k8s_management benchmark. Leaving enabled for other benchmarks '
+    'wastes reserved capacity on wrong instance types.',
+)
+flags.DEFINE_boolean(
+    'eks_skip_ebs_csi',
+    False,
+    'If True, skip EBS CSI driver setup (OIDC + IAM role + addon install) '
+    'during EKS cluster creation. Safe for the k8s_management benchmark '
+    'which does not use persistent volumes. Saves ~3 minutes per run.',
+)
diff --git a/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py b/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py
index 5d9bbc222b..4ce6174edd 100644
--- a/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py
+++ b/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py
@@ -15,9 +15,11 @@
 """Contains classes/functions related to Azure Kubernetes Service."""
 
 import json
+import time
 from typing import Any, List
 
 from absl import flags
+from absl import logging
 from perfkitbenchmarker import errors
 from perfkitbenchmarker import provider_info
 from perfkitbenchmarker import virtual_machine
@@ -154,8 +156,7 @@ def GetResourceMetadata(self):
   def _IsAutoscalerEnabled(self, nodepool_config: container.BaseNodePoolConfig):
     """Returns True if the cluster autoscaler is enabled."""
     return (
-        nodepool_config.min_nodes
-        != nodepool_config.max_nodes
+        nodepool_config.min_nodes != nodepool_config.max_nodes
         # Auto node provisioning mode is incompatible with cluster autoscaler.
     ) and not FLAGS.azure_aks_auto_node_provisioning
 
@@ -539,6 +540,393 @@ def AddNodepool(self, batch_name, pool_id):
         spot=FLAGS.azure_low_priority_vms,
     )
 
+  def CreateNodePool(
+      self,
+      nodepool_config: container.BaseNodePoolConfig,
+      node_version: str | None = None,
+  ) -> None:
+    """Creates a single named node pool on the cluster.
+
+    Args:
+      nodepool_config: Config of the node pool to add.
+      node_version: Optional Kubernetes version for the pool; overrides any
+        --kubernetes-version produced by _GetNodeFlags.
+
+    Raises:
+      errors.Resource.CreationError: az returned a non-zero exit code.
+    """
+    flag = '--kubernetes-version'
+    node_flags = self._GetNodeFlags(nodepool_config)
+    if node_version and flag in node_flags:
+      node_flags[node_flags.index(flag) + 1] = node_version
+    elif node_version:
+      node_flags += [flag, node_version]
+    pool = _AzureNodePoolName(nodepool_config.name)
+    label = f'pkb_nodepool={nodepool_config.name}'
+    cmd = [azure.AZURE_PATH, 'aks', 'nodepool', 'add']
+    cmd += ['--cluster-name', self.name, '--name', pool, '--labels', label]
+    _, stderr, retcode = vm_util.IssueCommand(
+        cmd + node_flags, timeout=1800, raise_on_failure=False
+    )
+    if retcode:
+      raise errors.Resource.CreationError(stderr)
+
+  def DeleteNodePool(self, name: str) -> None:
+    """Deletes the named node pool via `az aks nodepool delete`.
+
+    Args:
+      name: PKB node pool name; mapped to its AKS name with
+        _AzureNodePoolName.
+    """
+    pool = _AzureNodePoolName(name)
+    cmd = [azure.AZURE_PATH, 'aks', 'nodepool', 'delete']
+    cmd += ['--cluster-name', self.name, '--name', pool]
+    # NOTE(review): reuses the create-cluster command runner for a delete;
+    # presumably for its error handling — confirm.
+    self._RunCreateClusterCmd(cmd + self.resource_group.args)
+
+  def UpgradeNodePool(self, name: str, target_version: str) -> None:
+    """Upgrades the named node pool to target_version.
+
+    Args:
+      name: PKB node pool name; mapped to its AKS name with
+        _AzureNodePoolName.
+      target_version: Kubernetes version passed as --kubernetes-version.
+    """
+    upgrade = [azure.AZURE_PATH, 'aks', 'nodepool', 'upgrade']
+    upgrade += ['--cluster-name', self.name]
+    upgrade += ['--name', _AzureNodePoolName(name)]
+    upgrade += ['--kubernetes-version', target_version]
+    # No raise_on_failure override: IssueCommand's default failure handling
+    # applies here.
+    vm_util.IssueCommand(upgrade + self.resource_group.args, timeout=1800)
+
+  def UpdateCluster(self) -> None:
+    """Performs a cluster-level update by rewriting the cluster tags.
+
+    `az aks update` acts on the cluster itself rather than on a node pool,
+    so this triggers a control-plane operation. The tag k8s-mgmt-ts is set
+    to the current Unix time in whole seconds, which makes the value differ
+    from one call to the next (for calls at least a second apart).
+    """
+    stamp = int(time.time())
+    # Timestamp tag: a new value on every call keeps this a real change.
+    cmd = (
+        [azure.AZURE_PATH, 'aks', 'update']
+        + ['--name', self.name]
+        + ['--tags', f'k8s-mgmt-ts={stamp}']
+        + self.resource_group.args
+    )
+    vm_util.IssueCommand(cmd, timeout=1800)
+
+  # ---- Async variants (return opaque handles) -------------------------------
+
+  def CreateNodePoolAsync(
+      self,
+      nodepool_config: container.BaseNodePoolConfig,
+      node_version: str | None = None,
+  ) -> str:
+    """Starts a --no-wait node pool create; returns 'np_succeeded:<pool>'.
+
+    Args:
+      nodepool_config: Config of the node pool to add.
+      node_version: Optional Kubernetes version for the pool; overrides any
+        --kubernetes-version produced by _GetNodeFlags.
+
+    Raises:
+      errors.Resource.CreationError: az failed with a non-retryable error,
+        or was still failing after the last retry.
+    """
+    version_flag = '--kubernetes-version'
+    node_flags = self._GetNodeFlags(nodepool_config)
+    if node_version and version_flag in node_flags:
+      node_flags[node_flags.index(version_flag) + 1] = node_version
+    elif node_version:
+      node_flags += [version_flag, node_version]
+    cmd = [azure.AZURE_PATH, 'aks', 'nodepool', 'add']
+    cmd += ['--cluster-name', self.name]
+    cmd += ['--name', _AzureNodePoolName(nodepool_config.name)]
+    cmd += ['--labels', f'pkb_nodepool={nodepool_config.name}', '--no-wait']
+    cmd += node_flags
+    # Transient: the cluster is at its concurrent-op or pool-count limit.
+    markers = (
+        'OperationNotAllowed',
+        'ConflictingOperationInProgress',
+        'MaxAgentPoolCountReached',
+    )
+    budget = 5
+    pause_s = 30
+    tries = 0
+    while True:
+      # AKS can take >300s to accept --no-wait under concurrent load.
+      _, stderr, retcode = vm_util.IssueCommand(
+          cmd, timeout=600, raise_on_failure=False
+      )
+      if not retcode:
+        return f'np_succeeded:{_AzureNodePoolName(nodepool_config.name)}'
+      tries += 1
+      if tries > budget or not any(m in stderr for m in markers):
+        raise errors.Resource.CreationError(stderr)
+      logging.warning(
+          '[AKS] CreateNodePoolAsync %s: retryable error (attempt %d/%d),'
+          ' sleeping %ds: %s',
+          _AzureNodePoolName(nodepool_config.name),
+          tries, budget, pause_s, stderr[:120],
+      )
+      time.sleep(pause_s)
+
+  def UpgradeNodePoolAsync(self, name: str, target_version: str) -> str:
+    """Starts a node pool upgrade with --no-wait.
+
+    Returns:
+      The handle 'np_succeeded:<AKS pool name>'.
+
+    Raises:
+      errors.Resource.CreationError: az returned a non-zero exit code.
+    """
+    pool = _AzureNodePoolName(name)
+    cmd = [azure.AZURE_PATH, 'aks', 'nodepool', 'upgrade']
+    cmd += ['--cluster-name', self.name]
+    cmd += ['--name', pool]
+    cmd += ['--kubernetes-version', target_version, '--no-wait']
+    # 600s timeout: under concurrent load Azure can take >300s to accept an
+    # `az aks nodepool upgrade --no-wait` request.
+    _, stderr, retcode = vm_util.IssueCommand(
+        cmd + self.resource_group.args, timeout=600, raise_on_failure=False
+    )
+    if retcode:
+      raise errors.Resource.CreationError(stderr)
+    return f'np_succeeded:{pool}'
+
+  def DeleteNodePoolAsync(self, name: str) -> str:
+    """Starts a node pool delete with --no-wait; returns 'np_gone:<pool>'.
+
+    A pool that never existed or was already removed is the desired
+    end state of a delete, so a not-found error counts as success. Raising
+    there made every delete phase fail for pools whose create had failed.
+
+    Raises:
+      errors.Resource.CreationError: az failed for any other reason.
+    """
+    pool = _AzureNodePoolName(name)
+    handle = f'np_gone:{pool}'
+    cmd = [azure.AZURE_PATH, 'aks', 'nodepool', 'delete']
+    cmd += ['--cluster-name', self.name, '--name', pool, '--no-wait']
+    # 600s timeout, matching the other --no-wait node pool calls.
+    _, stderr, retcode = vm_util.IssueCommand(
+        cmd + self.resource_group.args, timeout=600, raise_on_failure=False
+    )
+    if not retcode:
+      return handle
+    already_gone = 'NotFound' in stderr or 'not found' in stderr.lower()
+    if not already_gone:
+      raise errors.Resource.CreationError(stderr)
+    logging.info(
+        '[AKS] DeleteNodePoolAsync: %s already gone — treating as success',
+        pool,
+    )
+    return handle
+
+  def UpdateClusterAsync(self) -> str:
+    """Starts a long-running cluster update; returns 'cluster_succeeded'.
+
+    Scales the system node pool by one node with --no-wait (up when it has
+    at most one node, down otherwise). The scale is expected to take 3-8
+    minutes on AKS, giving Scenario B a meaningful overlap window. Falls
+    back to a timestamp tag update when the scale is not started; that
+    fallback raises errors.Resource.CreationError on failure.
+    """
+    list_cmd = [
+        azure.AZURE_PATH, 'aks', 'nodepool', 'list',
+        '--cluster-name', self.name,
+        '--query', '[?mode==`System`].{name:name,count:count}',
+        '--output', 'json',
+    ] + self.resource_group.args
+    out, _, rc = vm_util.IssueCommand(list_cmd, raise_on_failure=False)
+    if not rc and out.strip():
+      try:
+        pools = json.loads(out.strip())
+        if pools:
+          pool_name = pools[0]['name']
+          current_count = int(pools[0]['count'])
+          # Toggle by one node without ever going below one.
+          new_count = current_count + (1 if current_count <= 1 else -1)
+          scale_cmd = [
+              azure.AZURE_PATH, 'aks', 'nodepool', 'scale',
+              '--cluster-name', self.name,
+              '--name', pool_name,
+              '--node-count', str(new_count),
+              '--no-wait',
+          ] + self.resource_group.args
+          _, stderr, retcode = vm_util.IssueCommand(
+              scale_cmd, timeout=300, raise_on_failure=False
+          )
+          if not retcode:
+            logging.info(
+                '[AKS] UpdateClusterAsync: scaling system pool %s %d->%d',
+                pool_name, current_count, new_count,
+            )
+            return 'cluster_succeeded'
+          # Record why the scale was rejected before falling back.
+          logging.warning(
+              '[AKS] UpdateClusterAsync: scale failed: %s', stderr[:120]
+          )
+      except (ValueError, KeyError, IndexError, TypeError) as e:
+        # ValueError also covers json.JSONDecodeError.
+        logging.warning('[AKS] UpdateClusterAsync: pool parse error: %s', e)
+    logging.warning('[AKS] UpdateClusterAsync: falling back to tag update')
+    cmd = [azure.AZURE_PATH, 'aks', 'update', '--name', self.name]
+    cmd += ['--tags', f'k8s-mgmt-ts={int(time.time())}', '--no-wait']
+    cmd += self.resource_group.args
+    _, stderr, retcode = vm_util.IssueCommand(
+        cmd, timeout=300, raise_on_failure=False
+    )
+    if retcode:
+      raise errors.Resource.CreationError(stderr)
+    return 'cluster_succeeded'
+
+  def ResolveNodePoolVersions(self) -> tuple[str, str]:
+    """Returns (initial, target) AKS node pool versions as 'major.minor'.
+
+    Derived from cluster_version (or k8s_version), not kubectl: target is the
+    cluster minor N, initial is N-1. Raises ValueError if N-1 cannot be formed.
+    """
+    cluster_ver = self.cluster_version or self.k8s_version
+    parts = (cluster_ver or '').lstrip('v').split('.')
+    if len(parts) < 2 or not parts[1].isdigit() or int(parts[1]) < 1:
+      raise ValueError(f'Cannot derive N-1 from version {cluster_ver!r}')
+    target = f'{int(parts[0])}.{int(parts[1])}'
+    initial = f'{int(parts[0])}.{int(parts[1]) - 1}'
+    logging.info(
+        '[AKS] ResolveNodePoolVersions: cluster=%s initial=%s target=%s',
+        cluster_ver, initial, target,
+    )
+    return initial, target
+
+  def WaitForOperation(self, op_handle: str) -> None:
+    """Polls AKS until the terminal state named by op_handle is observed."""
+    kind, _, name = op_handle.partition(':')
+
+    @vm_util.Retry(
+        poll_interval=5,
+        fuzz=0,
+        timeout=3600,
+        retryable_exceptions=(errors.Resource.RetryableCreationError,),
+    )
+    def _wait_np_succeeded():
+      # Each poll is capped at 120s so that a hung `az aks nodepool show`
+      # cannot stall the retry loop indefinitely.
+      out, err, rc = vm_util.IssueCommand(
+          [
+              azure.AZURE_PATH,
+              'aks',
+              'nodepool',
+              'show',
+              '--cluster-name',
+              self.name,
+              '--name',
+              name,
+              '--query',
+              'provisioningState',
+              '--output',
+              'tsv',
+          ]
+          + self.resource_group.args,
+          raise_on_failure=False,
+          timeout=120,
+      )
+      if rc:
+        if 'NotFound' in (err or '') or 'not found' in (err or '').lower():
+          raise errors.Resource.CreationError(
+              f'nodepool {name} not found while waiting for Succeeded: {err}'
+          )
+        raise errors.Resource.RetryableCreationError(err)
+      status = out.strip()
+      if status == 'Succeeded':
+        return
+      if status in ('Failed', 'Canceled'):
+        raise errors.Resource.CreationError(
+            f'nodepool {name} ended in {status}'
+        )
+      raise errors.Resource.RetryableCreationError(
+          f'nodepool {name} state={status}'
+      )
+
+    @vm_util.Retry(
+        poll_interval=5,
+        fuzz=0,
+        timeout=3600,
+        retryable_exceptions=(errors.Resource.RetryableDeletionError,),
+    )
+    def _wait_np_gone():
+      # Per-poll 120s cap; success here is the pool no longer being found.
+      _, err, rc = vm_util.IssueCommand(
+          [
+              azure.AZURE_PATH,
+              'aks',
+              'nodepool',
+              'show',
+              '--cluster-name',
+              self.name,
+              '--name',
+              name,
+          ]
+          + self.resource_group.args,
+          raise_on_failure=False,
+          timeout=120,
+      )
+      if rc and ('NotFound' in (err or '') or 'not found' in (err or '').lower()):
+        return
+      if rc:
+        raise errors.Resource.RetryableDeletionError(err)
+      raise errors.Resource.RetryableDeletionError(
+          f'nodepool {name} still present'
+      )
+
+    @vm_util.Retry(
+        poll_interval=5,
+        fuzz=0,
+        timeout=3600,
+        retryable_exceptions=(errors.Resource.RetryableCreationError,),
+    )
+    def _wait_cluster_succeeded():
+      # Per-poll 120s cap; every describe failure is treated as retryable.
+      out, err, rc = vm_util.IssueCommand(
+          [
+              azure.AZURE_PATH,
+              'aks',
+              'show',
+              '--name',
+              self.name,
+              '--query',
+              'provisioningState',
+              '--output',
+              'tsv',
+          ]
+          + self.resource_group.args,
+          raise_on_failure=False,
+          timeout=120,
+      )
+      if rc:
+        raise errors.Resource.RetryableCreationError(err)
+      status = out.strip()
+      if status == 'Succeeded':
+        return
+      if status in ('Failed', 'Canceled'):
+        raise errors.Resource.CreationError(f'cluster update ended in {status}')
+      raise errors.Resource.RetryableCreationError(
+          f'cluster state={status}'
+      )
+
+    if kind == 'np_succeeded':
+      _wait_np_succeeded()
+    elif kind == 'np_gone':
+      _wait_np_gone()
+    elif kind == 'cluster_succeeded':
+      _wait_cluster_succeeded()
+    else:
+      raise ValueError(f'Unknown AKS op handle: {op_handle!r}')
+
 
 class AksAutomaticCluster(AksCluster):
   """Class representing an AKS Automatic cluster, which has managed node pools.
diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
index 6b0076aa69..76bd8afb97 100644
--- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
+++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
@@ -18,13 +18,16 @@
 import math
 import os
 import re
+import time
 import typing
 from typing import Any
 
 from absl import flags
 from perfkitbenchmarker import errors
 from perfkitbenchmarker import provider_info
+from perfkitbenchmarker import virtual_machine
 from perfkitbenchmarker import virtual_machine_spec
+from perfkitbenchmarker import vm_util
 from perfkitbenchmarker.configs import container_spec as container_spec_lib
 from perfkitbenchmarker.providers.gcp import flags as gcp_flags
 from perfkitbenchmarker.providers.gcp import gce_disk
@@ -52,8 +55,8 @@ def _CalculateCidrSize(nodes: int) -> int:
   # So 2^(32 - nodes) - 2^(32 - 20) >= 2^(32 - 24) * CIDR
   # OR CIDR <= 32 - log2(2^8 * nodes + 2^12)
   cidr_size = int(32 - math.log2((nodes << 8) + (1 << 12)))
-  # /19 is narrowest CIDR range GKE supports
-  return min(cidr_size, 19)
+  # /16 is narrowest CIDR range GKE supports
+  return min(cidr_size, 16)
 
 
 class GoogleArtifactRegistry(container_registry.BaseContainerRegistry):
@@ -259,10 +262,13 @@ def GetNodePoolNames(self) -> list[str]:
     # Command `gcloud container node-pools list` does not work for Autopilot
     # clusters - node pools are hidden and command results in 4xx.
     cmd = self._GcloudCommand('container', 'clusters', 'describe', self.name)
-    cmd.flags['flatten'] = 'nodePools'
-    cmd.flags['format'] = 'value(nodePools.name)'
+    cmd.flags['format'] = 'json'
     stdout, _, _ = cmd.Issue()
-    return stdout.split()
+    try:
+      cluster_info = json.loads(stdout)
+      return [np['name'] for np in cluster_info.get('nodePools', [])]
+    except (json.JSONDecodeError, ValueError, KeyError, TypeError):
+      return stdout.split()
 
   def GetMachineTypeFromNodeName(self, node_name: str) -> str | None:
     """Get the machine type from the node name."""
@@ -325,6 +331,8 @@ def InitializeNodePoolForCloud(
     nodepool_config.disk_size = vm_config.boot_disk_size
     nodepool_config.max_local_disks = vm_config.max_local_disks
     nodepool_config.ssd_interface = vm_config.ssd_interface
+    nodepool_config.gpu_type = vm_config.gpu_type
+    nodepool_config.gpu_count = vm_config.gpu_count
     nodepool_config.threads_per_core = vm_config.threads_per_core
     nodepool_config.gce_tags = vm_config.gce_tags
     nodepool_config.min_cpu_platform = vm_config.min_cpu_platform
@@ -360,6 +368,9 @@ def GetResourceMetadata(self) -> dict[str, Any]:
       result['gce_local_ssd_count'] = self.default_nodepool.max_local_disks
       result['gce_local_ssd_interface'] = self.default_nodepool.ssd_interface
     result['gke_nccl_fast_socket'] = self.enable_nccl_fast_socket
+    if 'nccl' in self.nodepools:
+      result['gpu_type'] = self.nodepools['nccl'].gpu_type
+      result['gpu_count'] = self.nodepools['nccl'].gpu_count
     if self.image_type:
       result['image_type'] = self.image_type
     if gcp_flags.MAX_CPU.value:
@@ -633,6 +644,339 @@ def ResizeNodePool(
       cmd.flags['node-pool'] = node_pool
     cmd.Issue()
 
+  def CreateNodePool(
+      self,
+      nodepool_config: container.BaseNodePoolConfig,
+      node_version: str | None = None,
+  ) -> None:
+    """Creates the node pool in nodepool_config, optionally at node_version."""
+    cmd = self._GcloudCommand(
+        'container',
+        'node-pools',
+        'create',
+        nodepool_config.name,
+        '--cluster',
+        self.name,
+    )
+    self._AddNodeParamsToCmd(nodepool_config, cmd)
+    if node_version:
+      cmd.flags['node-version'] = node_version
+    self._IssueResourceCreationCommand(cmd)
+
+  def DeleteNodePool(self, name: str) -> None:
+    """Deletes the named node pool, waiting up to ONE_HOUR for gcloud."""
+    cmd = self._GcloudCommand(
+        'container',
+        'node-pools',
+        'delete',
+        name,
+        '--cluster',
+        self.name,
+    )
+    cmd.args.append('--quiet')
+    cmd.Issue(timeout=ONE_HOUR)
+
+  def UpgradeNodePool(self, name: str, target_version: str) -> None:
+    """Upgrades node pool `name` to target_version (up to ONE_HOUR wait)."""
+    cmd = self._GcloudCommand(
+        'container',
+        'clusters',
+        'upgrade',
+        self.name,
+        '--node-pool',
+        name,
+        '--cluster-version',
+        target_version,
+    )
+    cmd.args.append('--quiet')
+    cmd.Issue(timeout=ONE_HOUR)
+
+  def UpdateCluster(self) -> None:
+    """Runs a real cluster-level update by setting a timestamped label.
+
+    Setting the k8s-mgmt-ts label is a cluster-level (not node pool) change
+    that leaves other configuration untouched. The value is int(time.time()),
+    so it is a real change whenever calls are at least one second apart.
+    """
+    cmd = self._GcloudCommand('container', 'clusters', 'update', self.name)
+    cmd.flags['update-labels'] = f'k8s-mgmt-ts={int(time.time())}'
+    cmd.Issue(timeout=ONE_HOUR)
+
+  # ---- Async variants (return opaque handles) -------------------------------
+
+  def _IssueAsync(self, cmd: util.GcloudCommand) -> str:
+    """Issues cmd with --async; returns the last stdout line as the op name."""
+    cmd.args.append('--async')
+    cmd.flags['format'] = 'value(name)'
+    stdout, stderr, retcode = cmd.Issue(timeout=600, raise_on_failure=False)
+    if retcode:
+      raise errors.Resource.CreationError(stderr)
+    op_name = ((stdout or '').strip().splitlines() or [''])[-1].strip()
+    if not op_name:
+      raise errors.Resource.CreationError(
+          f'GKE async command returned no operation name; stderr={stderr}'
+      )
+    return op_name
+
+  def _GetLatestOperationName(
+      self,
+      operation_type: str = 'UPGRADE_NODES',
+      target_name: str = '',
+      max_attempts: int = 5,
+      retry_delay: int = 3,
+      op_start_time: float = 0.0,
+  ) -> str:
+    """Returns the name of the most recent matching operation for this cluster.
+
+    The async gcloud command may return before the GKE control plane has
+    transitioned the operation from PENDING to RUNNING.  For fast operations
+    (e.g. label updates) the operation may already be DONE by the time this
+    method is called.  Passing op_start_time handles both cases.
+
+    Args:
+        operation_type: GKE operationType to filter on, e.g. 'UPGRADE_NODES'
+            for node pool upgrades or 'UPDATE_CLUSTER' for cluster-level
+            updates via 'gcloud container clusters update'.
+        target_name: Substring to match against targetLink (e.g. nodepool name
+            for UPGRADE_NODES, or cluster name for UPDATE_CLUSTER).  If empty,
+            falls back to self.name (the cluster name).
+        max_attempts: Number of query attempts before giving up.
+        retry_delay: Seconds to wait between attempts.
+        op_start_time: Unix timestamp recorded just before the async gcloud
+            command was issued.  When provided, the status filter is broadened
+            to include DONE (so fast-completing operations are found) and a
+            startTime >= guard is added to avoid matching old operations.
+
+    Returns:
+        Operation name string, or empty string if none found.
+    """
+    link_target = target_name or self.name
+    if op_start_time:
+      # Fast operations (e.g. --update-labels) may be DONE before we query.
+      # Broaden the status filter and add a startTime guard (with a 30-second
+      # buffer for clock skew) to avoid picking up older completed operations.
+      from_time = time.strftime(
+          '%Y-%m-%dT%H:%M:%SZ', time.gmtime(op_start_time - 30)
+      )
+      status_filter = '(status=RUNNING OR status=PENDING OR status=DONE)'
+      time_filter = f' AND startTime>="{from_time}"'
+    else:
+      # Slow operations (e.g. node pool upgrades): only look for active ops.
+      status_filter = '(status=RUNNING OR status=PENDING)'
+      time_filter = ''
+
+    filter_str = (
+        f'operationType={operation_type} AND '
+        f'{status_filter} AND '
+        f'targetLink ~ {link_target}'
+        f'{time_filter}'
+    )
+    for attempt in range(1, max_attempts + 1):
+      list_cmd = self._GcloudCommand('container', 'operations', 'list')
+      list_cmd.flags['filter'] = filter_str
+      list_cmd.flags['sort-by'] = '~startTime'
+      list_cmd.flags['limit'] = 1
+      list_cmd.flags['format'] = 'value(name)'
+      stdout, stderr, _ = list_cmd.Issue(raise_on_failure=False)
+      op_name = stdout.strip()
+      if op_name:
+        logging.info(
+            '_GetLatestOperationName: found op %s (type=%s target=%s) '
+            '(attempt %d/%d)', op_name, operation_type, link_target,
+            attempt, max_attempts,
+        )
+        return op_name
+      logging.warning(
+          '_GetLatestOperationName: no %s op found for target=%s '
+          '(attempt %d/%d), retrying in %ds. stderr=%s',
+          operation_type, link_target, attempt, max_attempts, retry_delay,
+          stderr,
+      )
+      if attempt < max_attempts:
+        time.sleep(retry_delay)
+    return ''
+
+#   def HasActiveUpgradeOperations(self) -> bool:
+#     """Checks if there are any active node pool upgrades running on the cluster."""
+#     cmd = self._GcloudCommand('container', 'operations', 'list')
+#     cmd.flags['project'] = self.project
+#     cmd.flags['zone'] = self.zone
+#     cmd.flags['filter'] = 'operationType=UPGRADE_NODES AND status=RUNNING'
+#     cmd.flags['sort-by'] = '~startTime'
+#     cmd.flags['limit'] = 1
+#     cmd.flags['format'] = 'value(name)'
+#     # Issue the command using PKB's native GcloudCommand wrapper
+#     stdout, _, _ = cmd.Issue(raise_on_failure=False)
+#     return bool(stdout.strip())
+
+  def CreateNodePoolAsync(
+      self,
+      nodepool_config: container.BaseNodePoolConfig,
+      node_version: str | None = None,
+  ) -> str:
+    """Starts a node pool create with --async; returns the operation name."""
+    cmd = self._GcloudCommand(
+        'container',
+        'node-pools',
+        'create',
+        nodepool_config.name,
+        '--cluster',
+        self.name,
+    )
+    self._AddNodeParamsToCmd(nodepool_config, cmd)
+    if node_version:
+      cmd.flags['node-version'] = node_version
+    # Some gcloud builds reject --timeout with --async, so drop the flag.
+    cmd.flags.pop('timeout', None)
+    return self._IssueAsync(cmd)
+
+  def UpgradeNodePoolAsync(self, name: str, target_version: str) -> str:
+    """Starts an async upgrade of node pool `name`; returns the op name."""
+    cmd = self._GcloudCommand(
+        'container',
+        'clusters',
+        'upgrade',
+        self.name,
+        '--node-pool',
+        name,
+        '--cluster-version',
+        target_version,
+    )
+    try:
+      return self._IssueAsync(cmd)
+    except errors.Resource.CreationError as e:
+      if 'returned no operation name' not in str(e):
+        raise
+      # gcloud exited 0 but printed nothing; look the op up by node pool.
+      logging.warning(
+          'UpgradeNodePoolAsync: falling back to operations list for '
+          'nodepool %s. Original error: %s', name, e
+      )
+      op_name = self._GetLatestOperationName(
+          operation_type='UPGRADE_NODES', target_name=name
+      )
+      if not op_name:
+        raise
+      return op_name
+
+  def DeleteNodePoolAsync(self, name: str) -> str:
+    """Starts an asynchronous delete of the named node pool.
+
+    Returns:
+      The operation name produced by _IssueAsync for the delete command.
+    """
+    delete_cmd = self._GcloudCommand(
+        'container', 'node-pools', 'delete', name, '--cluster', self.name
+    )
+    delete_cmd.args.append('--quiet')
+    return self._IssueAsync(delete_cmd)
+
+  def UpdateClusterAsync(self) -> str:
+    """Starts an async label update; returns the gcloud operation name."""
+    cmd = self._GcloudCommand('container', 'clusters', 'update', self.name)
+    cmd.flags['update-labels'] = f'k8s-mgmt-ts={int(time.time())}'
+    # 'gcloud container clusters update --async' suppresses stdout when
+    # --quiet is active (same behaviour as 'clusters upgrade'), so the
+    # operation name is never printed.  Remove --quiet here; the label-update
+    # is non-interactive so no confirmation prompt is needed.
+    cmd.flags.pop('quiet', None)
+    # Record start time BEFORE issuing.  The label-update operation completes
+    # in seconds, so it may already be DONE by the time the fallback queries
+    # the operations list.  The timestamp lets us safely include DONE ops
+    # without matching older completed operations from previous runs.
+    op_start_time = time.time()
+    try:
+      return self._IssueAsync(cmd)
+    except errors.Resource.CreationError as e:
+      if 'returned no operation name' not in str(e):
+        raise
+      # Fallback: gcloud exited 0 with empty stdout. Include DONE ops (label
+      # updates finish fast), bounded by op_start_time to skip stale ones.
+      logging.warning(
+          'UpdateClusterAsync: falling back to operations list for cluster %s.'
+          ' Original error: %s', self.name, e
+      )
+      op_name = self._GetLatestOperationName(
+          operation_type='UPDATE_CLUSTER',
+          target_name=self.name,
+          op_start_time=op_start_time,
+      )
+      if not op_name:
+        raise
+      return op_name
+
+  def ResolveNodePoolVersions(self) -> tuple[str, str]:
+    """Returns (initial, target) GKE node versions like '1.34.4-gke.1234'.
+
+    target is the highest validNodeVersions entry from get-server-config and
+    initial the highest entry one minor below it. NOTE(review): target is not
+    checked against this cluster's control-plane version - confirm that is ok.
+    """
+    cmd = self._GcloudCommand('container', 'get-server-config')
+    cmd.flags['format'] = 'json'
+    stdout, stderr, retcode = cmd.Issue(raise_on_failure=False)
+    if retcode:
+      raise errors.Resource.GetError(
+          f'gcloud get-server-config failed: {stderr}'
+      )
+    config = json.loads(stdout)
+    valid = list(config.get('validNodeVersions', []))
+    if not valid:
+      raise errors.Resource.GetError(
+          'GKE get-server-config returned no validNodeVersions'
+      )
+
+    def _version_tuple(v):
+      return tuple(int(x) for x in v.split('-', 1)[0].split('.'))
+
+    valid.sort(key=_version_tuple, reverse=True)
+    target = valid[0]
+    initial_minor = kubernetes_cluster.AdjacentMinorBelow(target)
+    for v in valid:
+      if kubernetes_cluster.BareMinor(v) == initial_minor:
+        return v, target
+    raise errors.Resource.GetError(
+        f'No GKE node version found for minor {initial_minor!r}; '
+        f'available top 5: {valid[:5]}'
+    )
+
+  def WaitForOperation(self, op_handle: str) -> None:
+    """Polls a GKE operation until terminal; raises if aborted or failed."""
+
+    @vm_util.Retry(
+        poll_interval=5,
+        fuzz=0,
+        timeout=ONE_HOUR,
+        retryable_exceptions=(errors.Resource.RetryableCreationError,),
+    )
+    def _poll():
+      describe = self._GcloudCommand(
+          'container', 'operations', 'describe', op_handle
+      )
+      describe.flags['format'] = 'json'
+      out, err, rc = describe.Issue(raise_on_failure=False)
+      if rc:
+        raise errors.Resource.RetryableCreationError(
+            f'describe op failed: {err}'
+        )
+      try:
+        op = json.loads(out)
+      except (json.JSONDecodeError, ValueError):
+        op = {'status': out.strip()}
+      status = op.get('status')
+      if status in ('ABORTING', 'ABORTED'):
+        raise errors.Resource.CreationError(f'op {op_handle} aborted')
+      if status != 'DONE':
+        raise errors.Resource.RetryableCreationError(
+            f'op {op_handle} status={status}'
+        )
+      # DONE only means "finished": a failed op carries an `error` field.
+      if op.get('error'):
+        raise errors.Resource.CreationError(
+            f'op {op_handle} failed: {op["error"]}'
+        )
+
+    _poll()
 
 class GkeAutopilotCluster(BaseGkeCluster):
   """Class representing an Autopilot GKE cluster, which has no nodepools."""
@@ -732,4 +1076,4 @@ def GetNodeSelectors(self, machine_type: str | None = None) -> dict[str, str]:
   def ResizeNodePool(
       self, new_size: int, node_pool: str = container_cluster.DEFAULT_NODEPOOL
   ):
-    raise NotImplementedError('Autopilot clusters do not support resizing.')
+    raise NotImplementedError('Autopilot clusters do not support resizing.')
\ No newline at end of file
diff --git a/perfkitbenchmarker/resources/container_service/kubernetes_cluster.py b/perfkitbenchmarker/resources/container_service/kubernetes_cluster.py
index 9b98d15508..fecb126114 100644
--- a/perfkitbenchmarker/resources/container_service/kubernetes_cluster.py
+++ b/perfkitbenchmarker/resources/container_service/kubernetes_cluster.py
@@ -1,5 +1,6 @@
 """Classes related to KubernetesCluster."""
 
+import abc
 import functools
 import json
 import logging
@@ -10,7 +11,7 @@
 from perfkitbenchmarker import vm_util
 from perfkitbenchmarker.configs import container_spec as container_spec_lib
 from perfkitbenchmarker.resources import kubernetes_inference_server
-from perfkitbenchmarker.resources.container_service import container as container_lib
+from perfkitbenchmarker.resources.container_service import (container as container_lib)
 from perfkitbenchmarker.resources.container_service import container_cluster
 from perfkitbenchmarker.resources.container_service import kubectl
 from perfkitbenchmarker.resources.container_service import kubernetes
@@ -54,6 +55,7 @@ def Create(self, restore: bool = False) -> None:
       self.inference_server.Create()
 
   def _PostCreate(self):
+    """Starts the event poller after the cluster has been created."""
     super()._PostCreate()
     if self.event_poller:
       self.event_poller.StartPolling()
@@ -151,6 +153,7 @@ def GetDefaultStorageClass(self) -> str:
 
   def GetNodeSelectors(self, machine_type: str | None = None) -> dict[str, str]:
     """Gets the node selectors section of a yaml for the provider."""
+    del machine_type  # Unused; subclasses may use it.
     return {}
 
   def ModifyPodSpecPlacementYaml(
@@ -165,9 +168,9 @@ def ModifyPodSpecPlacementYaml(
     the most likely to change from cloud to cloud.
 
     Args:
-      yaml_dicts: The list of yaml dicts to search through & modify. See
-        https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.34/#podspec-v1-core
-          for documentation on the pod spec fields. This is modified in place.
+      yaml_dicts: The list of yaml dicts to search through & modify. See the
+        K8s PodSpec API docs for pod spec field documentation. Modified
+        in place.
       name: The name of the app.
       machine_type: A specified machine type to request.
     """
@@ -195,9 +198,8 @@ def _ModifyPodSpecPlacementYaml(
     the most likely to change from cloud to cloud.
 
     Args:
-      pod_spec_yaml: The pod spec yaml to modify. See
-        https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.34/#podspec-v1-core
-          for documentation on the pod spec fields. This is modified in place.
+      pod_spec_yaml: The pod spec yaml to modify. See the K8s PodSpec API
+        docs for pod spec field documentation. This is modified in place.
       name: The name of the app.
       machine_type: A specified machine type to request.
     """
@@ -304,9 +306,126 @@ def _GetAddressFromIngress(self, ingress_out: str):
       )
     return 'http://' + ip.strip()
 
-  def AddNodepool(self, batch_name: str, pool_id: str):
-    """Adds an additional nodepool with the given name to the cluster."""
-    pass
+  def AddNodepool(self, batch_name: str, pool_id: str) -> None:
+    """Adds a node pool; delegates to CreateNodePool for standard clusters.
+
+    Karpenter-based subclasses override this to apply a manifest instead.
+    """
+    nodepool_config = container_lib.BaseNodePoolConfig(
+        name=f'{batch_name}-{pool_id}',
+    )
+    self.CreateNodePool(nodepool_config)
+
+  def CreateNodePool(
+      self,
+      nodepool_config: container_lib.BaseNodePoolConfig,
+      node_version: str | None = None,
+  ) -> None:
+    """Creates a single named node pool on the cluster (blocks until ready).
+
+    Args:
+      nodepool_config: Node pool definition (name, machine type, node count).
+      node_version: Optional Kubernetes version to pin the node pool to. None
+        means use the cluster default.
+    """
+    raise NotImplementedError
+
+  def DeleteNodePool(self, name: str) -> None:
+    """Deletes the named node pool (blocks until removed)."""
+    raise NotImplementedError
+
+  def UpgradeNodePool(self, name: str, target_version: str) -> None:
+    """Upgrades the named node pool to the given Kubernetes version."""
+    raise NotImplementedError
+
+  def UpdateCluster(self) -> None:
+    """Performs a lightweight cluster-level update operation (blocks).
+
+    Intended for management-plane benchmarks that need to overlap a real
+    cluster-level operation with a node-pool operation. The implementation
+    should issue a control-plane mutation (so an actual operation runs) that
+    is non-destructive and idempotent across repeated invocations.
+    """
+    raise NotImplementedError
+
+  def CreateNodePoolAsync(
+      self,
+      nodepool_config: container_lib.BaseNodePoolConfig,
+      node_version: str | None = None,
+  ) -> str:
+    """Initiates node-pool create; returns opaque op handle. Does NOT wait."""
+    raise NotImplementedError
+
+  def UpgradeNodePoolAsync(self, name: str, target_version: str) -> str:
+    """Initiates node-pool upgrade; returns opaque op handle. Does NOT wait."""
+    raise NotImplementedError
+
+  def DeleteNodePoolAsync(self, name: str) -> str:
+    """Initiates node-pool delete; returns opaque op handle. Does NOT wait."""
+    raise NotImplementedError
+
+  def UpdateClusterAsync(self) -> str:
+    """Initiates cluster-level update. Returns op handle; does NOT wait."""
+    raise NotImplementedError
+
+  @abc.abstractmethod
+  def GetNodePoolNames(self) -> list[str]:
+    """Returns the names of all node pools currently in the cluster.
+
+    The kubernetes_management benchmark is said to use this to sweep stale
+    pkbm* pools before a run and to re-list live pools before deleting.
+    NOTE(review): abstract, unlike sibling stubs; do all subclasses override it?
+    """
+
+  def WaitForOperation(self, op_handle: str) -> None:
+    """Blocks until the operation identified by op_handle completes.
+
+    Args:
+      op_handle: provider-specific opaque string from one of the *Async
+        methods above.
+
+    Raises:
+      errors.Resource.RetryableCreationError or similar on timeout/failure.
+    """
+    raise NotImplementedError
+
+  def ResolveNodePoolVersions(self) -> tuple[str, str]:
+    """Returns (initial, target) K8s versions per benchmark spec.
+
+    Spec contract:
+      target  = cluster's current K8s version (the latest available)
+      initial = the adjacent minor below target (e.g., target=1.35 -> 1.34)
+    Default implementation returns bare-minor strings ("1.34", "1.35") which
+    EKS and AKS accept directly. Providers requiring fully-qualified versions
+    (notably GKE) must override.
+    """
+    target = BareMinor(self.k8s_version)
+    initial = AdjacentMinorBelow(self.k8s_version)
+    return initial, target
+
+
+def BareMinor(version: str) -> str:
+  """Extracts 'major.minor' from a Kubernetes version string.
+
+  A single leading 'v' and any '-suffix' are dropped first, so 'v1.35.4',
+  '1.35.4-gke.1234' and '1.35' all yield '1.35'. Raises ValueError otherwise.
+  """
+  version = version[1:] if version.startswith('v') else version
+  fields = version.partition('-')[0].split('.')
+  if all(f.isdigit() for f in fields[:2]) and len(fields) >= 2:
+    return '.'.join(fields[:2])
+  # The message reports the version after the leading 'v' has been removed.
+  raise ValueError(f'Cannot parse K8s version: {version!r}')
+
+
+def AdjacentMinorBelow(version: str) -> str:
+  """Returns 'major.(minor-1)' for a version, e.g. '1.35.4' -> '1.34'."""
+  major_text, _, minor_text = BareMinor(version).partition('.')
+  previous_minor = int(minor_text) - 1
+  if previous_minor < 0:
+    # Minor 0 has no predecessor within the same major version.
+    raise ValueError(f'No adjacent minor below {version!r}')
+  return f'{major_text}.{previous_minor}'
 
 
 def _DeleteAllFromDefaultNamespace():
diff --git a/tests/linux_benchmarks/kubernetes_management_benchmark_test.py b/tests/linux_benchmarks/kubernetes_management_benchmark_test.py
new file mode 100644
index 0000000000..6852c8df46
--- /dev/null
+++ b/tests/linux_benchmarks/kubernetes_management_benchmark_test.py
@@ -0,0 +1,1105 @@
+# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for linux_benchmarks.kubernetes_management_benchmark."""
+
+# pylint: disable=invalid-name,protected-access
+
+import threading
+import time
+import unittest
+from unittest import mock
+
+from absl import flags
+from absl.testing import flagsaver
+from perfkitbenchmarker import errors
+from perfkitbenchmarker import sample
+from perfkitbenchmarker.linux_benchmarks import kubernetes_management_benchmark
+from perfkitbenchmarker.resources.container_service import kubernetes_cluster
+from tests import pkb_common_test_case
+
+FLAGS = flags.FLAGS
+
+_CLUSTER_NAME = 'test-cluster'
+
+
+def _make_sample(metric, value, unit='seconds', metadata=None):
+  return sample.Sample(metric, value, unit, metadata or {})
+
+
+def _make_mock_cluster(
+    name=_CLUSTER_NAME,
+    k8s_version='1.34',
+    pool_names=None,
+):
+  """Creates a fully-stubbed KubernetesCluster mock for use in tests."""
+  cluster = mock.create_autospec(
+      kubernetes_cluster.KubernetesCluster, instance=True
+  )
+  cluster.name = name
+  cluster.k8s_version = k8s_version
+  cluster.cluster_version = k8s_version
+  cluster.GetNodePoolNames.return_value = pool_names or []
+  cluster.ResolveNodePoolVersions.return_value = ('1.33', '1.34')
+  cluster.CreateNodePoolAsync.return_value = 'op-create-1'
+  cluster.UpgradeNodePoolAsync.return_value = 'op-upgrade-1'
+  cluster.DeleteNodePoolAsync.return_value = 'op-delete-1'
+  cluster.UpdateClusterAsync.return_value = 'op-update-1'
+  cluster.WaitForOperation.return_value = None
+  default_np = mock.MagicMock()
+  default_np.machine_type = 'e2-standard-2'
+  default_np.num_nodes = 1
+  default_np.min_nodes = 1
+  default_np.max_nodes = 1
+  default_np.zone = 'us-central1-a'
+  default_np.disk_size = 100
+  default_np.name = 'default-pool'
+  cluster.default_nodepool = default_np
+  return cluster
+
+
+def _make_mock_benchmark_spec(cluster=None):
+  spec = mock.MagicMock()
+  spec.container_cluster = cluster or _make_mock_cluster()
+  return spec
+
+
+def _make_mock_config(cluster_type='Kubernetes'):
+  cfg = mock.MagicMock()
+  cfg.container_cluster.type = cluster_type
+  return cfg
+
+
+class ScenarioNameTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for _ScenarioAName, _SCENARIO_B_NAME, and _ScenarioCName."""
+
+  def testScenarioANameZeroPadsToThreeDigits(self):
+    self.assertEqual(
+        'pkbma000',
+        kubernetes_management_benchmark._ScenarioAName(0),
+    )
+
+  def testScenarioANameTwoDigitIndex(self):
+    self.assertEqual(
+        'pkbma042',
+        kubernetes_management_benchmark._ScenarioAName(42),
+    )
+
+  def testScenarioANameMaxThreeDigits(self):
+    self.assertEqual(
+        'pkbma999',
+        kubernetes_management_benchmark._ScenarioAName(999),
+    )
+
+  def testScenarioBNameIsConstant(self):
+    self.assertEqual(
+        'pkbmb',
+        kubernetes_management_benchmark._SCENARIO_B_NAME,
+    )
+
+  def testScenarioCNameZeroPadsToFourDigits(self):
+    self.assertEqual(
+        'pkbmc0000',
+        kubernetes_management_benchmark._ScenarioCName(0),
+    )
+
+  def testScenarioCNameSingleDigitIndex(self):
+    self.assertEqual(
+        'pkbmc0007',
+        kubernetes_management_benchmark._ScenarioCName(7),
+    )
+
+  def testScenarioCNameFourDigitIndex(self):
+    self.assertEqual(
+        'pkbmc1000',
+        kubernetes_management_benchmark._ScenarioCName(1000),
+    )
+
+  def testAllNamesWithinAksLimit(self):
+    for i in range(1000):
+      self.assertLessEqual(
+          len(kubernetes_management_benchmark._ScenarioAName(i)), 12
+      )
+    for i in range(10000):
+      self.assertLessEqual(
+          len(kubernetes_management_benchmark._ScenarioCName(i)), 12
+      )
+    self.assertLessEqual(
+        len(kubernetes_management_benchmark._SCENARIO_B_NAME), 12
+    )
+
+
+class CheckPrerequisitesTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the CheckPrerequisites validation function."""
+
+  def testValidScenariosPass(self):
+    with flagsaver.flagsaver(k8s_mgmt_scenarios=['A', 'B', 'C']):
+      kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config())
+
+  def testInvalidScenarioRaises(self):
+    with flagsaver.flagsaver(k8s_mgmt_scenarios=['X']):
+      with self.assertRaises(errors.Config.InvalidValue):
+        kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config())
+
+  def testMixedValidInvalidRaises(self):
+    with flagsaver.flagsaver(k8s_mgmt_scenarios=['A', 'Z']):
+      with self.assertRaises(errors.Config.InvalidValue):
+        kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config())
+
+  def testNonKubernetesClusterTypeRaises(self):
+    with flagsaver.flagsaver(k8s_mgmt_scenarios=['A']):
+      with self.assertRaises(errors.Config.InvalidValue):
+        kubernetes_management_benchmark.CheckPrerequisites(
+            _make_mock_config(cluster_type='Mesos')
+        )
+
+  def testInvalidScaleSweepRaises(self):
+    with flagsaver.flagsaver(
+        k8s_mgmt_scenarios=['C'], k8s_mgmt_scale_sweep=['10', 'abc']
+    ):
+      with self.assertRaises(errors.Config.InvalidValue):
+        kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config())
+
+  def testValidScaleSweepPasses(self):
+    with flagsaver.flagsaver(
+        k8s_mgmt_scenarios=['C'], k8s_mgmt_scale_sweep=['10', '50', '100']
+    ):
+      kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config())
+
+  def testLowercaseScenarioRaises(self):
+    with flagsaver.flagsaver(k8s_mgmt_scenarios=['a']):
+      with self.assertRaises(errors.Config.InvalidValue):
+        kubernetes_management_benchmark.CheckPrerequisites(_make_mock_config())
+
+
+class PrepareTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the Prepare benchmark lifecycle function."""
+
+  def _patch_kubectl(self, rc=0):
+    return mock.patch(
+        'perfkitbenchmarker.resources.container_service.kubectl'
+        + '.RunKubectlCommand',
+        return_value=('', '', rc),
+    )
+
+  def testPrepareRunsKubectlSleepPod(self):
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with self._patch_kubectl() as mock_kubectl:
+      kubernetes_management_benchmark.Prepare(bm_spec)
+      mock_kubectl.assert_called_once()
+      args = mock_kubectl.call_args[0][0]
+      self.assertIn('run', args)
+      self.assertIn('pkb-mgmt-sleep', args)
+      self.assertIn('sleep', args)
+
+  def testPrepareSetsAlwaysCallCleanup(self):
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with self._patch_kubectl():
+      kubernetes_management_benchmark.Prepare(bm_spec)
+    self.assertTrue(bm_spec.always_call_cleanup)
+
+  def testPrepareToleratesKubectlNonZeroReturn(self):
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with self._patch_kubectl(rc=1):
+      kubernetes_management_benchmark.Prepare(bm_spec)
+
+
+class CleanupTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the Cleanup benchmark lifecycle function."""
+
+  def _patch_kubectl(self):
+    return mock.patch(
+        'perfkitbenchmarker.resources.container_service.kubectl'
+        + '.RunKubectlCommand',
+        return_value=('', '', 0),
+    )
+
+  def testCleanupDeletesSleepPod(self):
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with self._patch_kubectl() as mock_kubectl:
+      kubernetes_management_benchmark.Cleanup(bm_spec)
+      delete_calls = [
+          str(c) for c in mock_kubectl.call_args_list
+          if 'pkb-mgmt-sleep' in str(c)
+      ]
+      self.assertNotEmpty(delete_calls)
+
+  def testCleanupDeletesAllPkbmPrefixedPools(self):
+    cluster = _make_mock_cluster(
+        pool_names=['pkbma000', 'default-pool', 'pkbmc0001']
+    )
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with self._patch_kubectl():
+      kubernetes_management_benchmark.Cleanup(bm_spec)
+    deleted = {c.args[0] for c in cluster.DeleteNodePool.call_args_list}
+    self.assertIn('pkbma000', deleted)
+    self.assertIn('pkbmc0001', deleted)
+    self.assertNotIn('default-pool', deleted)
+
+  def testCleanupSkipsDeleteWhenNoLeftoverPools(self):
+    cluster = _make_mock_cluster(pool_names=['default-pool'])
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with self._patch_kubectl():
+      kubernetes_management_benchmark.Cleanup(bm_spec)
+    cluster.DeleteNodePool.assert_not_called()
+
+  def testCleanupHandlesNoneCluster(self):
+    bm_spec = _make_mock_benchmark_spec()
+    bm_spec.container_cluster = None
+    kubernetes_management_benchmark.Cleanup(bm_spec)
+
+
+class CleanStartSweepTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _CleanStartSweep helper function."""
+
+  def testDeletesStalePkbmPools(self):
+    cluster = _make_mock_cluster(
+        pool_names=['pkbma000', 'pkbmc0001', 'user-pool']
+    )
+    kubernetes_management_benchmark._CleanStartSweep(cluster)
+    deleted = {c.args[0] for c in cluster.DeleteNodePool.call_args_list}
+    self.assertIn('pkbma000', deleted)
+    self.assertIn('pkbmc0001', deleted)
+    self.assertNotIn('user-pool', deleted)
+
+  def testDoesNothingWhenNoPkbmPools(self):
+    cluster = _make_mock_cluster(pool_names=['user-pool', 'default-pool'])
+    kubernetes_management_benchmark._CleanStartSweep(cluster)
+    cluster.DeleteNodePool.assert_not_called()
+
+  def testToleratesGetNodePoolNamesException(self):
+    cluster = _make_mock_cluster()
+    cluster.GetNodePoolNames.side_effect = RuntimeError('API error')
+    kubernetes_management_benchmark._CleanStartSweep(cluster)
+
+
+class ResultsTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _Results result-accumulator helper."""
+
+  def testAddSingleEntry(self):
+    r = kubernetes_management_benchmark._Results()
+    r.add('op1', 0.1, 1.0, None)
+    self.assertLen(r.entries, 1)
+    name, init, e2e, err = r.entries[0]
+    self.assertEqual('op1', name)
+    self.assertAlmostEqual(0.1, init, places=5)
+    self.assertAlmostEqual(1.0, e2e, places=5)
+    self.assertIsNone(err)
+
+  def testAddMultipleEntries(self):
+    r = kubernetes_management_benchmark._Results()
+    r.add('op1', 0.1, 1.0, None)
+    r.add('op2', 0.2, 2.0, ValueError('fail'))
+    self.assertLen(r.entries, 2)
+
+  def testAddIsThreadSafe(self):
+    """Tests that concurrent add() calls from multiple threads are safe."""
+    r = kubernetes_management_benchmark._Results()
+    n = 100
+
+    def _add(i):
+      r.add(f'op{i}', float(i), float(i) * 2, None)
+
+    threads = [threading.Thread(target=_add, args=(i,)) for i in range(n)]
+    for t in threads:
+      t.start()
+    for t in threads:
+      t.join()
+    self.assertLen(r.entries, n)
+
+  def testAddPreservesError(self):
+    r = kubernetes_management_benchmark._Results()
+    exc = RuntimeError('test error')
+    r.add('failing-op', 0.5, 0.5, exc)
+    _, _, _, err = r.entries[0]
+    self.assertIs(exc, err)
+
+
+class TimedAsyncTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _TimedAsync timing helper."""
+
+  def testSuccessfulKickoffAndWait(self):
+    kickoff = mock.Mock(return_value='op-handle')
+    wait_fn = mock.Mock(return_value=None)
+    init_lat, e2e_lat, err = kubernetes_management_benchmark._TimedAsync(
+        kickoff, wait_fn
+    )
+    kickoff.assert_called_once()
+    wait_fn.assert_called_once_with('op-handle')
+    self.assertIsNone(err)
+    self.assertGreaterEqual(init_lat, 0.0)
+    self.assertGreaterEqual(e2e_lat, init_lat)
+
+  def testKickoffFailureReturnsError(self):
+    exc = RuntimeError('kickoff failed')
+    kickoff = mock.Mock(side_effect=exc)
+    wait_fn = mock.Mock()
+    init_lat, e2e_lat, err = kubernetes_management_benchmark._TimedAsync(
+        kickoff, wait_fn
+    )
+    self.assertIs(exc, err)
+    wait_fn.assert_not_called()
+    self.assertAlmostEqual(init_lat, e2e_lat, places=2)
+
+  def testWaitFailureReturnsError(self):
+    exc = RuntimeError('wait failed')
+    kickoff = mock.Mock(return_value='op-handle')
+    wait_fn = mock.Mock(side_effect=exc)
+    _, e2e_lat, err = kubernetes_management_benchmark._TimedAsync(
+        kickoff, wait_fn
+    )
+    self.assertIs(exc, err)
+    self.assertGreater(e2e_lat, 0.0)
+
+  def testInitLatencyNotGreaterThanE2eLatency(self):
+    kickoff = mock.Mock(return_value='handle')
+    wait_fn = mock.Mock(side_effect=lambda _: time.sleep(0.01))
+    init_lat, e2e_lat, err = kubernetes_management_benchmark._TimedAsync(
+        kickoff, wait_fn
+    )
+    self.assertIsNone(err)
+    self.assertLessEqual(init_lat, e2e_lat)
+
+  def testHandlePassedToWaitFn(self):
+    kickoff = mock.Mock(return_value='my-op-handle')
+    wait_fn = mock.Mock()
+    kubernetes_management_benchmark._TimedAsync(kickoff, wait_fn)
+    wait_fn.assert_called_once_with('my-op-handle')
+
+
+class RunAsyncTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _RunAsync concurrent execution helper."""
+
+  def testEmptyItemsReturnsEmptyList(self):
+    results = kubernetes_management_benchmark._RunAsync(
+        kickoff=mock.Mock(),
+        wait_fn=mock.Mock(),
+        items=[],
+        get_name=str,
+    )
+    self.assertEmpty(results)
+
+  @flagsaver.flagsaver(k8s_mgmt_max_concurrent=50)
+  def testReturnsOneResultPerItem(self):
+    kickoff = mock.Mock(return_value='op-handle')
+    wait_fn = mock.Mock(return_value=None)
+    results = kubernetes_management_benchmark._RunAsync(
+        kickoff=kickoff, wait_fn=wait_fn, items=['a', 'b', 'c'], get_name=str
+    )
+    self.assertLen(results, 3)
+    self.assertEqual({'a', 'b', 'c'}, {name for name, _, _, _ in results})
+
+  @flagsaver.flagsaver(k8s_mgmt_max_concurrent=50)
+  def testKickoffErrorCapturedInResults(self):
+    kickoff = mock.Mock(side_effect=RuntimeError('kaboom'))
+    results = kubernetes_management_benchmark._RunAsync(
+        kickoff=kickoff, wait_fn=mock.Mock(), items=['x'], get_name=str
+    )
+    self.assertLen(results, 1)
+    _, _, _, err = results[0]
+    self.assertIsNotNone(err)
+
+  @flagsaver.flagsaver(k8s_mgmt_max_concurrent=2)
+  def testConcurrencyCapDoesNotDropItems(self):
+    results = kubernetes_management_benchmark._RunAsync(
+        kickoff=mock.Mock(return_value='op'),
+        wait_fn=mock.Mock(return_value=None),
+        items=list(range(5)),
+        get_name=str,
+    )
+    self.assertLen(results, 5)
+
+  @flagsaver.flagsaver(k8s_mgmt_max_concurrent=50)
+  def testGetNameCallableApplied(self):
+    cfg = mock.MagicMock()
+    cfg.name = 'poolname'
+    results = kubernetes_management_benchmark._RunAsync(
+        kickoff=mock.Mock(return_value='h'),
+        wait_fn=mock.Mock(),
+        items=[cfg],
+        get_name=lambda c: c.name,
+    )
+    name, _, _, _ = results[0]
+    self.assertEqual('poolname', name)
+
+
+class MakeNodePoolConfigTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _MakeNodePoolConfig factory."""
+
+  @flagsaver.flagsaver(k8s_mgmt_nodes_per_nodepool=3)
+  def testNameIsSet(self):
+    cluster = _make_mock_cluster()
+    cfg = kubernetes_management_benchmark._MakeNodePoolConfig(cluster, 'mypool')
+    self.assertEqual('mypool', cfg.name)
+
+  @flagsaver.flagsaver(k8s_mgmt_nodes_per_nodepool=3)
+  def testNumNodesComesFromFlag(self):
+    cluster = _make_mock_cluster()
+    cfg = kubernetes_management_benchmark._MakeNodePoolConfig(cluster, 'p')
+    self.assertEqual(3, cfg.num_nodes)
+    self.assertEqual(3, cfg.min_nodes)
+    self.assertEqual(3, cfg.max_nodes)
+
+  @flagsaver.flagsaver(k8s_mgmt_nodes_per_nodepool=1)
+  def testDoesNotMutateDefaultNodepool(self):
+    cluster = _make_mock_cluster()
+    original_name = cluster.default_nodepool.name
+    kubernetes_management_benchmark._MakeNodePoolConfig(cluster, 'newname')
+    self.assertEqual(original_name, cluster.default_nodepool.name)
+
+
+class OpSamplesTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _OpSamples sample-generation helper."""
+
+  def testEmptyResultsYieldsSuccessRateOfZero(self):
+    samples = kubernetes_management_benchmark._OpSamples(
+        'PrefixOp', [], attempted_ops=5
+    )
+    rate = next(s for s in samples if s.metric == 'PrefixOp_SuccessRate')
+    self.assertEqual(0.0, rate.value)
+
+  def testPerOpInitiationAndE2eSamplesGenerated(self):
+    results = [('op1', 0.1, 1.0, None), ('op2', 0.2, 2.0, None)]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'MyOp', results, attempted_ops=2
+    )
+    metrics = [s.metric for s in samples]
+    self.assertIn('MyOp_InitiationLatency', metrics)
+    self.assertIn('MyOp_EndToEndLatency', metrics)
+
+  def testSuccessRateHundredPercentWhenAllSucceed(self):
+    results = [('op1', 1.0, 2.0, None), ('op2', 0.5, 1.5, None)]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'Op', results, attempted_ops=2
+    )
+    rate = next(s for s in samples if s.metric == 'Op_SuccessRate')
+    self.assertAlmostEqual(100.0, rate.value)
+
+  def testSuccessRateFiftyPercentWhenHalfFail(self):
+    results = [
+        ('op1', 1.0, 2.0, None),
+        ('op2', 0.5, 0.5, RuntimeError('fail')),
+    ]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'Op', results, attempted_ops=2
+    )
+    rate = next(s for s in samples if s.metric == 'Op_SuccessRate')
+    self.assertAlmostEqual(50.0, rate.value)
+
+  def testAttemptedOpsExceedingExecutedOpsLowersRate(self):
+    results = [('op1', 1.0, 2.0, None)]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'Op', results, attempted_ops=3
+    )
+    rate = next(s for s in samples if s.metric == 'Op_SuccessRate')
+    self.assertAlmostEqual(100.0 / 3, rate.value, places=3)
+
+  def testSuccessRateMetadataFields(self):
+    results = [('op1', 1.0, 2.0, None), ('op2', 0.5, 0.5, Exception('err'))]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'Op', results, attempted_ops=3
+    )
+    rate = next(s for s in samples if s.metric == 'Op_SuccessRate')
+    self.assertEqual('3', rate.metadata['total_ops'])
+    self.assertEqual('2', rate.metadata['executed_ops'])
+    self.assertEqual('1', rate.metadata['successful_ops'])
+    self.assertEqual('1', rate.metadata['skipped_ops'])
+
+  def testFailedOpIncludesErrorMessage(self):
+    results = [('fail-op', 0.5, 0.5, RuntimeError('oops'))]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'Op', results, attempted_ops=1
+    )
+    init_s = next(s for s in samples if s.metric == 'Op_InitiationLatency')
+    self.assertIn('error', init_s.metadata)
+    self.assertIn('oops', init_s.metadata['error'])
+
+  def testAggregatesGeneratedForTwoOrMoreSuccesses(self):
+    results = [(f'op{i}', float(i), float(i) * 2, None) for i in range(1, 4)]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'Op', results, attempted_ops=3
+    )
+    metrics = [s.metric for s in samples]
+    self.assertIn('Op_InitiationLatency_Mean', metrics)
+    self.assertIn('Op_EndToEndLatency_Mean', metrics)
+
+  def testAggregatesNotGeneratedForSingleSuccess(self):
+    results = [('op1', 1.0, 2.0, None)]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'Op', results, attempted_ops=1
+    )
+    self.assertNotIn('Op_InitiationLatency_Mean',
+                     [s.metric for s in samples])
+
+  def testOutliersGeneratedForFourOrMoreSuccesses(self):
+    results = [(f'op{i}', float(i), float(i) * 2, None) for i in range(1, 6)]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'Op', results, attempted_ops=5
+    )
+    metrics = [s.metric for s in samples]
+    self.assertIn('Op_InitiationLatency_OutlierCount', metrics)
+    self.assertIn('Op_EndToEndLatency_OutlierCount', metrics)
+
+  def testOutliersNotGeneratedForThreeOrFewerSuccesses(self):
+    results = [(f'op{i}', float(i), float(i) * 2, None) for i in range(1, 4)]
+    samples = kubernetes_management_benchmark._OpSamples(
+        'Op', results, attempted_ops=3
+    )
+    self.assertNotIn('Op_InitiationLatency_OutlierCount',
+                     [s.metric for s in samples])
+
+
+class AggregateSamplesTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _AggregateSamples statistics helper."""
+
+  def testProducesAllExpectedStatMetrics(self):
+    samples = kubernetes_management_benchmark._AggregateSamples(
+        'Pfx', 'InitiationLatency', [1.0, 2.0, 3.0, 4.0, 5.0]
+    )
+    metrics = {s.metric for s in samples}
+    for label in ('Mean', 'StdDev', 'Min', 'Median', 'P90', 'P99', 'Max'):
+      self.assertIn(f'Pfx_InitiationLatency_{label}', metrics)
+
+  def testMeanValueCorrect(self):
+    samples = kubernetes_management_benchmark._AggregateSamples(
+        'Op', 'E2E', [1.0, 2.0, 3.0, 4.0, 5.0]
+    )
+    mean_s = next(s for s in samples if 'Mean' in s.metric)
+    self.assertAlmostEqual(3.0, mean_s.value, places=3)
+
+  def testMinValueCorrect(self):
+    samples = kubernetes_management_benchmark._AggregateSamples(
+        'Op', 'E2E', [10.0, 20.0, 30.0]
+    )
+    min_s = next(s for s in samples if 'Min' in s.metric)
+    self.assertAlmostEqual(10.0, min_s.value, places=3)
+
+  def testMaxValueCorrect(self):
+    samples = kubernetes_management_benchmark._AggregateSamples(
+        'Op', 'E2E', [10.0, 20.0, 30.0]
+    )
+    max_s = next(s for s in samples if 'Max' in s.metric)
+    self.assertAlmostEqual(30.0, max_s.value, places=3)
+
+  def testSampleCountInMetadata(self):
+    samples = kubernetes_management_benchmark._AggregateSamples(
+        'Op', 'E2E', [1.0, 2.0, 3.0]
+    )
+    for s in samples:
+      self.assertEqual('3', s.metadata.get('sample_count'))
+
+  def testUnitsAreSeconds(self):
+    samples = kubernetes_management_benchmark._AggregateSamples(
+        'Op', 'E2E', [1.0, 2.0]
+    )
+    for s in samples:
+      self.assertEqual('seconds', s.unit)
+
+
+class OutlierSamplesTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _OutlierSamples IQR-based outlier detection helper."""
+
+  def testNoOutliersYieldsZeroCount(self):
+    samples = kubernetes_management_benchmark._OutlierSamples(
+        'Op', 'E2E', [1.0, 1.1, 1.2, 1.3, 1.4, 1.5]
+    )
+    self.assertLen(samples, 1)
+    self.assertEqual(0, samples[0].value)
+
+  def testClearOutlierDetected(self):
+    samples = kubernetes_management_benchmark._OutlierSamples(
+        'Op', 'E2E', [1.0, 1.0, 1.0, 1.0, 100.0]
+    )
+    self.assertEqual(1, samples[0].value)
+
+  def testMetricNameFormatted(self):
+    samples = kubernetes_management_benchmark._OutlierSamples(
+        'MyPrefix', 'InitiationLatency', [1.0, 2.0, 3.0, 4.0]
+    )
+    self.assertEqual(
+        'MyPrefix_InitiationLatency_OutlierCount', samples[0].metric
+    )
+
+  def testMetadataContainsFenceFields(self):
+    meta = kubernetes_management_benchmark._OutlierSamples(
+        'Op', 'E2E', [1.0, 2.0, 3.0, 4.0, 5.0]
+    )[0].metadata
+    for field in ('q1', 'q3', 'iqr', 'upper_fence', 'lower_fence',
+                  'sample_count'):
+      self.assertIn(field, meta)
+
+  def testSampleCountInMetadata(self):
+    samples = kubernetes_management_benchmark._OutlierSamples(
+        'Op', 'E2E', [1.0, 2.0, 3.0, 4.0, 5.0]
+    )
+    self.assertEqual('5', samples[0].metadata['sample_count'])
+
+  def testUnitIsCount(self):
+    samples = kubernetes_management_benchmark._OutlierSamples(
+        'Op', 'E2E', [1.0, 2.0, 3.0, 4.0]
+    )
+    self.assertEqual('count', samples[0].unit)
+
+  def testReturnsSingleSample(self):
+    samples = kubernetes_management_benchmark._OutlierSamples(
+        'Op', 'E2E', list(range(1, 11))
+    )
+    self.assertLen(samples, 1)
+
+
+class RunTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the Run benchmark entry-point function."""
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_scenarios=['A', 'B', 'C'],
+      k8s_mgmt_scale_sweep=[],
+      k8s_mgmt_large_scale_nodepools=10,
+  )
+  def testRunCallsCleanStartSweep(self):
+    """Tests that Run invokes _CleanStartSweep before executing scenarios."""
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with mock.patch.object(
+        kubernetes_management_benchmark, '_CleanStartSweep'
+    ) as mock_clean, mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioA', return_value=[]
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioB', return_value=[]
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioC', return_value=[]
+    ):
+      kubernetes_management_benchmark.Run(bm_spec)
+    mock_clean.assert_called_once_with(cluster)
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_scenarios=['A'],
+      k8s_mgmt_scale_sweep=[],
+      k8s_mgmt_large_scale_nodepools=10,
+  )
+  def testRunOnlyScenarioACallsOnlyA(self):
+    """Tests that Run only calls _RunScenarioA when scenarios=['A']."""
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with mock.patch.object(
+        kubernetes_management_benchmark, '_CleanStartSweep'
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioA', return_value=[]
+    ) as mock_a, mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioB', return_value=[]
+    ) as mock_b, mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioC', return_value=[]
+    ) as mock_c:
+      kubernetes_management_benchmark.Run(bm_spec)
+    mock_a.assert_called_once()
+    mock_b.assert_not_called()
+    mock_c.assert_not_called()
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_scenarios=['B'],
+      k8s_mgmt_scale_sweep=[],
+      k8s_mgmt_large_scale_nodepools=10,
+  )
+  def testRunOnlyScenarioBCallsOnlyB(self):
+    """Tests that Run only calls _RunScenarioB when scenarios=['B']."""
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with mock.patch.object(
+        kubernetes_management_benchmark, '_CleanStartSweep'
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioA', return_value=[]
+    ) as mock_a, mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioB', return_value=[]
+    ) as mock_b, mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioC', return_value=[]
+    ) as mock_c:
+      kubernetes_management_benchmark.Run(bm_spec)
+    mock_a.assert_not_called()
+    mock_b.assert_called_once()
+    mock_c.assert_not_called()
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_scenarios=['C'],
+      k8s_mgmt_scale_sweep=[],
+      k8s_mgmt_large_scale_nodepools=42,
+  )
+  def testRunScenarioCPassesLargeScaleFlag(self):
+    """Tests that Run passes the large-scale-nodepools flag to _RunScenarioC."""
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with mock.patch.object(
+        kubernetes_management_benchmark, '_CleanStartSweep'
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioA', return_value=[]
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioB', return_value=[]
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioC', return_value=[]
+    ) as mock_c:
+      kubernetes_management_benchmark.Run(bm_spec)
+    mock_c.assert_called_once()
+    _, _, scale = mock_c.call_args.args
+    self.assertEqual(42, scale)
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_scenarios=['C'],
+      k8s_mgmt_scale_sweep=['10', '50'],
+      k8s_mgmt_large_scale_nodepools=100,
+  )
+  def testRunScenarioCScaleSweepRunsTwice(self):
+    """Tests that Run calls _RunScenarioC once per scale in the sweep."""
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    with mock.patch.object(
+        kubernetes_management_benchmark, '_CleanStartSweep'
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioA', return_value=[]
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioB', return_value=[]
+    ), mock.patch.object(
+        kubernetes_management_benchmark,
+        '_RunScenarioC',
+        return_value=[_make_sample('m', 1.0)],
+    ) as mock_c:
+      kubernetes_management_benchmark.Run(bm_spec)
+    self.assertEqual(2, mock_c.call_count)
+    scales = [call.args[2] for call in mock_c.call_args_list]
+    self.assertIn(10, scales)
+    self.assertIn(50, scales)
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_scenarios=['C'],
+      k8s_mgmt_scale_sweep=['10'],
+      k8s_mgmt_large_scale_nodepools=10,
+  )
+  def testRunTagsScenarioCScaleInMetadata(self):
+    """Tests that Run adds scenario_c_scale to each sample's metadata."""
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    test_sample = _make_sample('metric', 1.0)
+    with mock.patch.object(
+        kubernetes_management_benchmark, '_CleanStartSweep'
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioA', return_value=[]
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioB', return_value=[]
+    ), mock.patch.object(
+        kubernetes_management_benchmark,
+        '_RunScenarioC',
+        return_value=[test_sample],
+    ):
+      samples = kubernetes_management_benchmark.Run(bm_spec)
+    self.assertIn('scenario_c_scale', samples[0].metadata)
+    self.assertEqual('10', samples[0].metadata['scenario_c_scale'])
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_scenarios=['A'],
+      k8s_mgmt_scale_sweep=[],
+      k8s_mgmt_large_scale_nodepools=10,
+  )
+  def testRunTagsAllSamplesWithRunMetadata(self):
+    """Tests that Run adds version and config keys to all sample metadata."""
+    cluster = _make_mock_cluster()
+    bm_spec = _make_mock_benchmark_spec(cluster)
+    test_sample = _make_sample('m', 1.0)
+    with mock.patch.object(
+        kubernetes_management_benchmark, '_CleanStartSweep'
+    ), mock.patch.object(
+        kubernetes_management_benchmark,
+        '_RunScenarioA',
+        return_value=[test_sample],
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioB', return_value=[]
+    ), mock.patch.object(
+        kubernetes_management_benchmark, '_RunScenarioC', return_value=[]
+    ):
+      samples = kubernetes_management_benchmark.Run(bm_spec)
+    meta = samples[0].metadata
+    for key in ('initial_version', 'target_version', 'cluster_k8s_version',
+                'nodes_per_nodepool', 'concurrent_nodepools'):
+      self.assertIn(key, meta)
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_scenarios=['A'],
+      k8s_mgmt_initial_version='1.30',
+      k8s_mgmt_target_version='1.31',
+      k8s_mgmt_scale_sweep=[],
+      k8s_mgmt_large_scale_nodepools=10,
+  )
+  def testRunUsesExplicitVersionFlags(self):
+    """Checks Run prefers the version flags and skips auto-resolution."""
+    bench = kubernetes_management_benchmark
+    fake_cluster = _make_mock_cluster()
+    spec = _make_mock_benchmark_spec(fake_cluster)
+    scenario_a_output = [_make_sample('m', 1.0)]
+    # Stub the sweep helper and all scenario runners; only Run itself executes.
+    with mock.patch.object(bench, '_CleanStartSweep'), mock.patch.object(
+        bench, '_RunScenarioA', return_value=scenario_a_output
+    ), mock.patch.object(
+        bench, '_RunScenarioB', return_value=[]
+    ), mock.patch.object(
+        bench, '_RunScenarioC', return_value=[]
+    ):
+      samples = bench.Run(spec)
+    fake_cluster.ResolveNodePoolVersions.assert_not_called()
+    run_tags = samples[0].metadata
+    self.assertEqual('1.30', run_tags['initial_version'])
+    self.assertEqual('1.31', run_tags['target_version'])
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_scenarios=['A'],
+      k8s_mgmt_scale_sweep=[],
+      k8s_mgmt_large_scale_nodepools=10,
+  )
+  def testRunAutoResolvesVersionsWhenFlagsAbsent(self):
+    """Checks Run falls back to ResolveNodePoolVersions without flags."""
+    bench = kubernetes_management_benchmark
+    fake_cluster = _make_mock_cluster()
+    fake_cluster.ResolveNodePoolVersions.return_value = ('1.33', '1.34')
+    spec = _make_mock_benchmark_spec(fake_cluster)
+    scenario_a_output = [_make_sample('m', 1.0)]
+    with mock.patch.object(bench, '_CleanStartSweep'), mock.patch.object(
+        bench, '_RunScenarioA', return_value=scenario_a_output
+    ), mock.patch.object(
+        bench, '_RunScenarioB', return_value=[]
+    ), mock.patch.object(
+        bench, '_RunScenarioC', return_value=[]
+    ):
+      samples = bench.Run(spec)
+    fake_cluster.ResolveNodePoolVersions.assert_called_once()
+    # The resolved pair should show up in the sample metadata.
+    run_tags = samples[0].metadata
+    self.assertEqual('1.33', run_tags['initial_version'])
+    self.assertEqual('1.34', run_tags['target_version'])
+
+
+class RunScenarioATest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _RunScenarioA phase-by-phase and pipelined modes."""
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_concurrent_nodepools=2,
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+      k8s_mgmt_pipeline_scenario_a=False,
+  )
+  def testPhaseByPhaseProducesCreateUpgradeDeleteSamples(self):
+    """Tests Scenario A produces Create, Upgrade, and Delete samples."""
+    cluster = _make_mock_cluster(pool_names=['pkbma000', 'pkbma001'])
+    samples = kubernetes_management_benchmark._RunScenarioA(
+        cluster, '1.33', '1.34'
+    )
+    metrics = {s.metric for s in samples}
+    self.assertTrue(any('ScenarioA_Create' in m for m in metrics))
+    self.assertTrue(any('ScenarioA_Upgrade' in m for m in metrics))
+    self.assertTrue(any('ScenarioA_Delete' in m for m in metrics))
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_concurrent_nodepools=2,
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+      k8s_mgmt_pipeline_scenario_a=False,
+  )
+  def testPhaseByPhasePassesInitialVersionToCreate(self):
+    """Tests _RunScenarioA passes initial_version to CreateNodePoolAsync."""
+    cluster = _make_mock_cluster(pool_names=['pkbma000', 'pkbma001'])
+    kubernetes_management_benchmark._RunScenarioA(cluster, '1.33', '1.34')
+    # Assert a create was actually issued; otherwise the loop below would
+    # iterate over nothing and the test would pass without checking anything.
+    cluster.CreateNodePoolAsync.assert_called()
+    for call in cluster.CreateNodePoolAsync.call_args_list:
+      # node_version may be passed by keyword or as the second positional.
+      positional = call.args[1] if len(call.args) > 1 else None
+      self.assertEqual('1.33', call.kwargs.get('node_version') or positional)
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_concurrent_nodepools=2,
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+      k8s_mgmt_pipeline_scenario_a=False,
+  )
+  def testPhaseByPhaseDeleteUsesLivePoolList(self):
+    """Tests that _RunScenarioA deletes only the pools it finds at runtime."""
+    cluster = _make_mock_cluster(pool_names=['pkbma000'])
+    kubernetes_management_benchmark._RunScenarioA(cluster, '1.33', '1.34')
+    self.assertEqual(1, cluster.DeleteNodePoolAsync.call_count)
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_concurrent_nodepools=2,
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+      k8s_mgmt_pipeline_scenario_a=True,
+  )
+  def testPipelinedModeActivatedByFlag(self):
+    """Tests pipelined mode is activated by the pipeline_scenario_a flag."""
+    cluster = _make_mock_cluster(pool_names=[])
+    samples = kubernetes_management_benchmark._RunScenarioA(
+        cluster, '1.33', '1.34'
+    )
+    metrics = {s.metric for s in samples}
+    self.assertTrue(any('ScenarioA_Create' in m for m in metrics))
+    self.assertTrue(any('ScenarioA_Upgrade' in m for m in metrics))
+    self.assertTrue(any('ScenarioA_Delete' in m for m in metrics))
+
+
+class RunScenarioAPipelinedTest(pkb_common_test_case.PkbCommonTestCase):
+  """Exercises _RunScenarioAPipelined, the pipelined Scenario A path."""
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testPipelinedProducesAllThreePhases(self):
+    """Checks the pipelined run emits Create, Upgrade and Delete metrics."""
+    k8s_cluster = _make_mock_cluster(pool_names=[])
+    samples = kubernetes_management_benchmark._RunScenarioAPipelined(
+        k8s_cluster, n=2, initial='1.33', target='1.34'
+    )
+    emitted = {s.metric for s in samples}
+    # Each phase must appear as a substring of at least one metric name.
+    for phase in ('ScenarioA_Create', 'ScenarioA_Upgrade', 'ScenarioA_Delete'):
+      self.assertTrue(any(phase in name for name in emitted))
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testPipelinedSkipsUpgradeAfterCreateFailure(self):
+    """Checks no upgrade is issued when the create call raises."""
+    k8s_cluster = _make_mock_cluster(pool_names=[])
+    k8s_cluster.CreateNodePoolAsync.side_effect = RuntimeError('create failed')
+    samples = kubernetes_management_benchmark._RunScenarioAPipelined(
+        k8s_cluster, n=1, initial='1.33', target='1.34'
+    )
+    k8s_cluster.UpgradeNodePoolAsync.assert_not_called()
+    # The upgrade success-rate sample is optional; if present it must be 0.0.
+    upgrade_rates = [
+        s.value for s in samples if s.metric == 'ScenarioA_Upgrade_SuccessRate'
+    ]
+    if upgrade_rates:
+      self.assertEqual(0.0, upgrade_rates[0])
+
+
+class RunScenarioBTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the _RunScenarioB cluster-update + nodepool-create scenario."""
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testProducesClusterUpdateAndNodePoolCreateSamples(self):
+    """Tests Scenario B emits ClusterUpdate and NodePoolCreate samples."""
+    cluster = _make_mock_cluster(pool_names=[])
+    samples = kubernetes_management_benchmark._RunScenarioB(cluster, '1.33')
+    metrics = {s.metric for s in samples}
+    self.assertTrue(any('ScenarioB_ClusterUpdate' in m for m in metrics))
+    self.assertTrue(any('ScenarioB_NodePoolCreate' in m for m in metrics))
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testDeletesTestPoolAfterRun(self):
+    """Tests _RunScenarioB deletes the _SCENARIO_B_NAME pool exactly once."""
+    cluster = _make_mock_cluster(pool_names=[])
+    kubernetes_management_benchmark._RunScenarioB(cluster, '1.33')
+    expected_pool = kubernetes_management_benchmark._SCENARIO_B_NAME
+    cluster.DeleteNodePool.assert_called_once_with(expected_pool)
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testToleratesDeleteFailure(self):
+    """Tests _RunScenarioB does not propagate a DeleteNodePool error."""
+    cluster = _make_mock_cluster(pool_names=[])
+    cluster.DeleteNodePool.side_effect = RuntimeError('delete failed')
+    kubernetes_management_benchmark._RunScenarioB(cluster, '1.33')
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testPassesInitialVersionToCreate(self):
+    """Tests _RunScenarioB passes initial_version to CreateNodePoolAsync."""
+    cluster = _make_mock_cluster(pool_names=[])
+    kubernetes_management_benchmark._RunScenarioB(cluster, '1.33')
+    # Guard: without a create call the loop below would check nothing.
+    cluster.CreateNodePoolAsync.assert_called()
+    for call in cluster.CreateNodePoolAsync.call_args_list:
+      positional = call.args[1] if len(call.args) > 1 else None
+      self.assertEqual('1.33', call.kwargs.get('node_version') or positional)
+
+
+class RunScenarioCTest(pkb_common_test_case.PkbCommonTestCase):
+  """Exercises _RunScenarioC, the large-scale create-then-delete scenario."""
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testProducesCreateAndDeleteSamples(self):
+    """Checks Scenario C emits both Create and Delete metrics."""
+    k8s_cluster = _make_mock_cluster(pool_names=['pkbmc0000', 'pkbmc0001'])
+    run_c = kubernetes_management_benchmark._RunScenarioC
+    emitted = {s.metric for s in run_c(k8s_cluster, '1.33', scale=2)}
+    for phase in ('ScenarioC_Create', 'ScenarioC_Delete'):
+      self.assertTrue(any(phase in name for name in emitted))
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testZeroLivePoolsRecordsZeroDeleteSuccessRate(self):
+    """Checks an empty live-pool list yields a 0.0 delete success rate."""
+    k8s_cluster = _make_mock_cluster(pool_names=[])
+    samples = kubernetes_management_benchmark._RunScenarioC(
+        k8s_cluster, '1.33', scale=3
+    )
+    delete_rate = next(
+        s.value for s in samples if s.metric == 'ScenarioC_Delete_SuccessRate'
+    )
+    self.assertEqual(0.0, delete_rate)
+    k8s_cluster.DeleteNodePoolAsync.assert_not_called()
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testDeleteUsesLiveListNotOriginalCreateList(self):
+    """Checks two deletes are issued for two listed pools despite scale=3."""
+    k8s_cluster = _make_mock_cluster(pool_names=['pkbmc0000', 'pkbmc0001'])
+    kubernetes_management_benchmark._RunScenarioC(k8s_cluster, '1.33', scale=3)
+    self.assertEqual(2, k8s_cluster.DeleteNodePoolAsync.call_count)
+
+  @flagsaver.flagsaver(
+      k8s_mgmt_nodes_per_nodepool=1,
+      k8s_mgmt_max_concurrent=50,
+  )
+  def testCreateSuccessRateUsesScaleAsDenominator(self):
+    """Checks the create success-rate sample reports total_ops == scale."""
+    k8s_cluster = _make_mock_cluster(pool_names=['pkbmc0000'])
+    samples = kubernetes_management_benchmark._RunScenarioC(
+        k8s_cluster, '1.33', scale=3
+    )
+    create_rate = next(
+        filter(lambda s: s.metric == 'ScenarioC_Create_SuccessRate', samples)
+    )
+    self.assertLessEqual(create_rate.value, 100.0)
+    self.assertEqual('3', create_rate.metadata['total_ops'])
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/tests/providers/aws/elastic_kubernetes_service_test.py b/tests/providers/aws/elastic_kubernetes_service_test.py
index 90d28eb834..bc8744d565 100644
--- a/tests/providers/aws/elastic_kubernetes_service_test.py
+++ b/tests/providers/aws/elastic_kubernetes_service_test.py
@@ -1,12 +1,16 @@
+"""Tests for the AWS Elastic Kubernetes Service provider."""
+# pylint: disable=invalid-name,protected-access
+
 import json
 import os
 import tempfile
 import unittest
 from unittest import mock
 from urllib import parse
-from absl.testing import flagsaver
-from absl.testing import parameterized
+from absl.testing import flagsaver  # pylint: disable=import-error
+from absl.testing import parameterized  # pylint: disable=import-error
 from perfkitbenchmarker import data
+from perfkitbenchmarker import errors
 from perfkitbenchmarker import network
 from perfkitbenchmarker import vm_util
 from perfkitbenchmarker.configs import container_spec
@@ -34,6 +38,7 @@
 
 
 class BaseEksTest(pkb_common_test_case.PkbCommonTestCase):
+  """Base test class providing common EKS cluster setup and mock helpers."""
 
   def setUp(self):
     super().setUp()
@@ -80,11 +85,13 @@ def MockJsonRead(self, cluster: elastic_kubernetes_service.BaseEksCluster):
 
 
 class ElasticKubernetesServiceTest(BaseEksTest):
+  """Tests for the managed-nodegroup EksCluster provider."""
 
   def testInitEksClusterWorks(self):
     elastic_kubernetes_service.EksCluster(EKS_SPEC)
 
   def testEksClusterCreateRegion(self):
+    """EksCluster._Create() without explicit AZ omits availabilityZones."""
     self.MockIssueCommand({'create cluster': [('Cluster created', '', 0)]})
     spec = container_spec.ContainerClusterSpec(
         'NAME',
@@ -121,6 +128,9 @@ def testEksClusterCreateRegion(self):
     )
 
   def testEksClusterCreateZone(self):
+    """EksCluster._Create() with a zone issues the expected eksctl commands."""
+    ebs_policy = 'arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy'
+    ebs_role = 'arn:aws:iam::1234:role/AmazonEKS_EBS_CSI_DriverRole_pkb-123p'
     issue_command = self.MockIssueCommand(
         {'create cluster': [('Cluster created', '', 0)]}
     )
@@ -136,7 +146,7 @@ def testEksClusterCreateZone(self):
             '--namespace=kube-system',
             '--region=us-west-1',
             '--cluster=pkb-123p',
-            '--attach-policy-arn=arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy',
+            f'--attach-policy-arn={ebs_policy}',
             '--approve',
             '--role-only',
             '--role-name=AmazonEKS_EBS_CSI_DriverRole_pkb-123p',
@@ -148,7 +158,7 @@ def testEksClusterCreateZone(self):
             '--name=aws-ebs-csi-driver',
             '--region=us-west-1',
             '--cluster=pkb-123p',
-            '--service-account-role-arn=arn:aws:iam::1234:role/AmazonEKS_EBS_CSI_DriverRole_pkb-123p',
+            f'--service-account-role-arn={ebs_role}',
         ]),
     ])
     assert self.patched_read_json is not None
@@ -158,6 +168,7 @@ def testEksClusterCreateZone(self):
     )
 
   def testEksClusterNodepools(self):
+    """Additional nodepools appear in the managedNodeGroups config."""
     self.MockIssueCommand({'create cluster': [('Cluster created', '', 0)]})
     spec2 = EKS_SPEC_DICT.copy()
     spec2['nodepools'] = {
@@ -200,6 +211,7 @@ def testEksClusterNodepools(self):
     )
 
   def testEksClusterNodepoolsAutoscaling(self):
+    """Autoscaling min/max/desired values propagate to managedNodeGroups."""
     self.MockIssueCommand({'create cluster': [('Cluster created', '', 0)]})
     spec2 = EKS_SPEC_DICT.copy()
     spec2['min_vm_count'] = 1
@@ -236,6 +248,7 @@ def testEksClusterNodepoolsAutoscaling(self):
     self.assertEqual(node_groups[1]['desiredCapacity'], 3)
 
   def testGetNodePoolNames(self):
+    """GetNodePoolNames returns list of nodegroup names from eksctl output."""
     # Mock the output of the aws cli command
     cluster = elastic_kubernetes_service.EksCluster(EKS_SPEC)
 
@@ -255,6 +268,7 @@ def testGetNodePoolNames(self):
     )
 
   def testGetNodePoolNamesKarpenter(self):
+    """GetNodePoolNames on Karpenter cluster returns kubectl nodepool names."""
     cluster = elastic_kubernetes_service.EksKarpenterCluster(EKS_SPEC)
     self.MockIssueCommand({
         'kubectl --kubeconfig  get nodepool -o json': [(
@@ -275,6 +289,7 @@ def testGetNodePoolNamesKarpenter(self):
       ('standard nodepool', 'nginx', 'nginx'),
   )
   def testEksClusterGetNodepoolFromName(self, nodepool_name, expected_name):
+    """GetNodePoolFromNodeName resolves a node name to its nodepool."""
     self.MockIssueCommand({'get node': [(nodepool_name, '', 0)]})
     spec2 = EKS_SPEC_DICT.copy()
     spec2['nodepools'] = {
@@ -296,6 +311,7 @@ def testEksClusterGetNodepoolFromName(self, nodepool_name, expected_name):
     self.assertEqual(nodepool.name, expected_name)
 
   def testEksClusterNotFound(self):
+    """GetNodePoolFromNodeName returns None when node is not found."""
     self.MockIssueCommand({'get node': [('', '', 0)]})
     spec2 = EKS_SPEC_DICT.copy()
     spec2['nodepools'] = {
@@ -326,6 +342,7 @@ def testEksClusterGetMachineTypeFromNodeName(self):
 
 
 class EksAutoClusterTest(BaseEksTest):
+  """Tests for the auto-mode EksAutoCluster provider."""
 
   def testInitEksClusterWorks(self):
     elastic_kubernetes_service.EksAutoCluster(EKS_SPEC)
@@ -340,6 +357,7 @@ def testEksClusterCreate(self):
     self.assertEqual(called_json['autoModeConfig'], {'enabled': True})
 
   def testEksClusterIsReady(self):
+    """EksAutoCluster._IsReady() returns True when cluster-info succeeds."""
     self.enter_context(
         mock.patch.object(
             kubectl,
@@ -347,7 +365,8 @@ def testEksClusterIsReady(self):
             return_value=(
                 (
                     r'^[[0;32mKubernetes control plane^[[0m is running at'
-                    r' ^[[0;33mhttps://RAND1234.gr7.us-west-1.eks.amazonaws.com^[[0mTo'
+                    r' ^[[0;33mhttps://RAND1234.gr7.us-west-1.'
+                    r'eks.amazonaws.com^[[0mTo'
                     " further debug and diagnose cluster problems, use 'kubectl"
                     " cluster-info dump'."
                 ),
@@ -361,6 +380,7 @@ def testEksClusterIsReady(self):
 
 
 class EksKarpenterTest(BaseEksTest):
+  """Tests for the Karpenter-based EksKarpenterCluster provider."""
 
   def setUp(self):
     super().setUp()
@@ -380,6 +400,7 @@ def testInitEksClusterWorks(self):
 
   @flagsaver.flagsaver(kubeconfig='/tmp/kubeconfig')
   def testEksYamlCreateFull(self):
+    """EksKarpenterCluster._Create() produces the expected eksctl yaml."""
     cluster = elastic_kubernetes_service.EksKarpenterCluster(EKS_SPEC)
     self.MockJsonRead(cluster)
     mock_cmd = self.MockIssueCommand({
@@ -452,6 +473,7 @@ def testEksYamlCreateFull(self):
   )
   @flagsaver.flagsaver(kubeconfig='/tmp/kubeconfig')
   def testEksYamlCreateFullNodepools(self, nodepool_config, expected_content):
+    """EksKarpenterCluster._PostCreate() logs expected nodepool yaml."""
     # Mock resources for _PostCreate
     self.MockIssueCommand({
         'helm upgrade --install karpenter': [('', '', 0)],
@@ -515,28 +537,17 @@ def testRecursiveDictionaryUpdate(self):
     expected = {'a': 3, 'deep': {'c': 2, 'd': 4}, 'f': 12}
     self.assertEqual(
         expected,
-        elastic_kubernetes_service.RecursivelyUpdateDictionary(base, update),
+        elastic_kubernetes_service._recursively_update_dictionary(base, update),
     )
 
   def testIngressAddressParsing(self):
     """Test parsing AWS ALB address with dualstack prefix removal."""
+    elb_host = 'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com'
     test_cases = [
-        (
-            'http://dualstack.k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com',
-            'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com',
-        ),
-        (
-            'https://dualstack.k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com',
-            'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com',
-        ),
-        (
-            'dualstack.k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com',
-            'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com',
-        ),
-        (
-            'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com',
-            'k8s-test-ingress-abc12345ef-123456789.us-east-1.elb.amazonaws.com',
-        ),
+        (f'http://dualstack.{elb_host}', elb_host),
+        (f'https://dualstack.{elb_host}', elb_host),
+        (f'dualstack.{elb_host}', elb_host),
+        (elb_host, elb_host),
     ]
     for address, expected in test_cases:
       with self.subTest(address=address):
@@ -549,5 +560,289 @@ def testIngressAddressParsing(self):
         self.assertEqual(normalized, expected)
 
 
+class EksManagementPlaneTest(BaseEksTest):
+  """Tests for EKS management-plane methods (k8s_management_benchmark)."""
+
+  def _make_cluster(self, spec_dict=None):
+    """Builds an EksCluster from spec_dict, defaulting to EKS_SPEC_DICT."""
+    spec = container_spec.ContainerClusterSpec(
+        'NAME', **(spec_dict or EKS_SPEC_DICT)
+    )
+    cluster = elastic_kubernetes_service.EksCluster(spec)
+    self.MockJsonRead(cluster)
+    # CLI responses are left to each test, which stubs via MockIssueCommand.
+    return cluster
+
+  def _make_nodepool_config(self, name='pkbpool0', machine_type='m5.large',
+                            num_nodes=2):
+    """Returns a MagicMock standing in for a nodepool config."""
+    cfg = mock.MagicMock(num_nodes=num_nodes, machine_type=machine_type)
+    # Mock() treats a `name` kwarg as the mock's own name; set it afterwards.
+    cfg.name = name
+    return cfg
+
+  # ---- CreateNodePoolAsync --------------------------------------------------
+
+  def testCreateNodePoolAsyncIssuesCreateNodegroup(self):
+    """CreateNodePoolAsync calls create-nodegroup; returns ng_active handle."""
+    cluster = self._make_cluster()
+    # Subnets / AZ discovery stubs
+    cluster._cached_subnets = ['subnet-1']
+    cluster._cached_subnets_per_az = {}
+    cluster._cached_node_role_arn = 'arn:aws:iam::1234:role/NodeRole'
+    mock_cmd = self.MockIssueCommand({'create-nodegroup': [('', '', 0)]})
+
+    handle = cluster.CreateNodePoolAsync(self._make_nodepool_config('poolA'))
+
+    self.assertEqual('ng_active:poolA', handle)
+    # Check the CLI call itself, as the other *Async tests in this class do.
+    self.assertIn('create-nodegroup', mock_cmd.all_commands)
+
+  def testCreateNodePoolAsyncReturnsNgActiveHandle(self):
+    """CreateNodePoolAsync returns 'ng_active:<name>' on success."""
+    cluster = self._make_cluster()
+    cluster._cached_subnets = ['subnet-1']
+    cluster._cached_subnets_per_az = {}
+    cluster._cached_node_role_arn = 'arn:aws:iam::1234:role/NodeRole'
+    self.MockIssueCommand({'': [('', '', 0)]})
+
+    handle = cluster.CreateNodePoolAsync(self._make_nodepool_config('myng'))
+    self.assertEqual('ng_active:myng', handle)
+
+  def testCreateNodePoolAsyncRaisesOnFailure(self):
+    """CreateNodePoolAsync raises when the CLI exits non-zero."""
+    cluster = self._make_cluster()
+    cluster._cached_subnets = ['subnet-1']
+    cluster._cached_subnets_per_az = {}
+    cluster._cached_node_role_arn = 'arn:aws:iam::1234:role/NodeRole'
+    self.MockIssueCommand({'': [('', 'error msg', 1)]})
+
+    with self.assertRaises(Exception):
+      cluster.CreateNodePoolAsync(self._make_nodepool_config('failng'))
+
+  # ---- UpgradeNodePoolAsync -------------------------------------------------
+
+  def testUpgradeNodePoolAsyncReturnsNgActiveHandle(self):
+    """UpgradeNodePoolAsync calls update-nodegroup-version; returns handle."""
+    cluster = self._make_cluster()
+    mock_cmd = self.MockIssueCommand(
+        {'update-nodegroup-version': [('', '', 0)]}
+    )
+    handle = cluster.UpgradeNodePoolAsync('my-ng', '1.34')
+
+    self.assertEqual('ng_active:my-ng', handle)
+    self.assertIn('update-nodegroup-version', mock_cmd.all_commands)
+    self.assertIn('--kubernetes-version 1.34', mock_cmd.all_commands)
+
+  def testUpgradeNodePoolAsyncRaisesOnFailure(self):
+    """UpgradeNodePoolAsync raises on non-zero exit code."""
+    cluster = self._make_cluster()
+    self.MockIssueCommand({'': [('', 'oops', 1)]})
+    with self.assertRaises(Exception):
+      cluster.UpgradeNodePoolAsync('bad-ng', '1.34')
+
+  # ---- DeleteNodePoolAsync --------------------------------------------------
+
+  def testDeleteNodePoolAsyncReturnsNgGoneHandle(self):
+    """DeleteNodePoolAsync calls delete-nodegroup, returns ng_gone handle."""
+    cluster = self._make_cluster()
+    mock_cmd = self.MockIssueCommand({'delete-nodegroup': [('', '', 0)]})
+    handle = cluster.DeleteNodePoolAsync('old-ng')
+
+    self.assertEqual('ng_gone:old-ng', handle)
+    self.assertIn('delete-nodegroup', mock_cmd.all_commands)
+    self.assertIn('--nodegroup-name old-ng', mock_cmd.all_commands)
+
+  # ---- UpdateClusterAsync ---------------------------------------------------
+
+  def testUpdateClusterAsyncReturnsClusterUpdateHandle(self):
+    """UpdateClusterAsync returns 'cluster_update:<update_id>'."""
+    cluster = self._make_cluster()
+    describe_out = json.dumps({
+        'cluster': {'logging': {'clusterLogging': []}}
+    })
+    update_out = json.dumps({'update': {'id': 'u-abc123'}})
+    self.MockIssueCommand({
+        'describe-cluster': [(describe_out, '', 0)],
+        'update-cluster-config': [(update_out, '', 0)],
+    })
+    handle = cluster.UpdateClusterAsync()
+    self.assertEqual('cluster_update:u-abc123', handle)
+
+  def testUpdateClusterAsyncTogglesLogging(self):
+    """UpdateClusterAsync issues update-cluster-config with --logging."""
+    cluster = self._make_cluster()
+    # Current state: logging disabled
+    describe_out = json.dumps({
+        'cluster': {'logging': {'clusterLogging': [{'enabled': False}]}}
+    })
+    update_out = json.dumps({'update': {'id': 'u-xyz'}})
+    mock_cmd = self.MockIssueCommand({
+        'describe-cluster': [(describe_out, '', 0)],
+        'update-cluster-config': [(update_out, '', 0)],
+    })
+    cluster.UpdateClusterAsync()
+    self.assertIn('update-cluster-config', mock_cmd.all_commands)
+    self.assertIn('--logging', mock_cmd.all_commands)
+
+  # ---- WaitForOperation -----------------------------------------------------
+
+  def testWaitForOperationNgActiveSuccess(self):
+    """WaitForOperation(ng_active:name) returns when nodegroup is ACTIVE."""
+    cluster = self._make_cluster()
+    ng_out = json.dumps({'nodegroup': {'status': 'ACTIVE'}})
+    self.MockIssueCommand({'describe-nodegroup': [(ng_out, '', 0)]})
+    # Should not raise
+    cluster.WaitForOperation('ng_active:my-ng')
+
+  def testWaitForOperationNgActiveFailedRaises(self):
+    """WaitForOperation raises when the nodegroup is CREATE_FAILED."""
+    cluster = self._make_cluster()
+    ng_out = json.dumps({'nodegroup': {'status': 'CREATE_FAILED'}})
+    self.MockIssueCommand({'describe-nodegroup': [(ng_out, '', 0)]})
+    with self.assertRaises(Exception):
+      cluster.WaitForOperation('ng_active:bad-ng')
+
+  def testWaitForOperationNgGoneSuccess(self):
+    """WaitForOperation(ng_gone:name) returns on ResourceNotFoundException."""
+    cluster = self._make_cluster()
+    self.MockIssueCommand({
+        'describe-nodegroup': [('', 'ResourceNotFoundException', 1)]
+    })
+    # Should not raise
+    cluster.WaitForOperation('ng_gone:deleted-ng')
+
+  def testWaitForOperationClusterUpdateSuccess(self):
+    """WaitForOperation(cluster_update:id) returns when update is Successful."""
+    cluster = self._make_cluster()
+    self.MockIssueCommand({'describe-update': [('Successful\n', '', 0)]})
+    # Should not raise
+    cluster.WaitForOperation('cluster_update:u-999')
+
+  def testWaitForOperationClusterUpdateFailedRaises(self):
+    """WaitForOperation raises when cluster update ends in Failed."""
+    cluster = self._make_cluster()
+    self.MockIssueCommand({'describe-update': [('Failed\n', '', 0)]})
+    with self.assertRaises(Exception):
+      cluster.WaitForOperation('cluster_update:u-fail')
+
+  def testWaitForOperationUnknownHandleRaises(self):
+    """WaitForOperation raises ValueError for unknown handle prefix."""
+    cluster = self._make_cluster()
+    with self.assertRaises(ValueError):
+      cluster.WaitForOperation('unknown_handle:xyz')
+
+  # ---- ResolveNodePoolVersions ----------------------------------------------
+
+  def testResolveNodePoolVersionsNMinus1Math(self):
+    """ResolveNodePoolVersions returns (N-1, N) from cluster_version."""
+    cluster = self._make_cluster()
+    cluster.cluster_version = '1.34'
+    initial, target = cluster.ResolveNodePoolVersions()
+    self.assertEqual('1.33', initial)
+    self.assertEqual('1.34', target)
+
+  def testResolveNodePoolVersionsStripsMinorPatch(self):
+    """ResolveNodePoolVersions strips patch from version strings."""
+    cluster = self._make_cluster()
+    cluster.cluster_version = '1.33.7'
+    initial, target = cluster.ResolveNodePoolVersions()
+    self.assertEqual('1.32', initial)
+    self.assertEqual('1.33', target)
+
+  # ---- _DiscoverSubnets -----------------------------------------------------
+
+  def testDiscoverSubnets(self):
+    """_DiscoverSubnets returns subnet IDs from describe-cluster."""
+    cluster = self._make_cluster()
+    describe_out = json.dumps({
+        'cluster': {
+            'resourcesVpcConfig': {
+                'subnetIds': ['subnet-aaa', 'subnet-bbb']
+            }
+        }
+    })
+    self.MockIssueCommand({'describe-cluster': [(describe_out, '', 0)]})
+    subnets = cluster._DiscoverSubnets()
+    self.assertEqual(['subnet-aaa', 'subnet-bbb'], subnets)
+
+  def testDiscoverSubnetsCached(self):
+    """_DiscoverSubnets returns the cached list without a CLI call."""
+    cluster = self._make_cluster()
+    cluster._cached_subnets = ['subnet-cached']
+    # No IssueCommand calls expected because cache is used
+    with mock.patch.object(vm_util, 'IssueCommand') as mock_issue:
+      result = cluster._DiscoverSubnets()
+    mock_issue.assert_not_called()
+    self.assertEqual(['subnet-cached'], result)
+
+  # ---- _DiscoverSubnetsPerAZ ------------------------------------------------
+
+  def testDiscoverSubnetsPerAZBuildsAzMap(self):
+    """_DiscoverSubnetsPerAZ builds a {AZ: subnet_id} map from EC2."""
+    cluster = self._make_cluster()
+    cluster._cached_subnets = ['subnet-a1', 'subnet-b2']
+    subnets_out = json.dumps([
+        {'SubnetId': 'subnet-a1', 'AZ': 'us-west-1a'},
+        {'SubnetId': 'subnet-b2', 'AZ': 'us-west-1b'},
+    ])
+    self.MockIssueCommand({'describe-subnets': [(subnets_out, '', 0)]})
+    az_map = cluster._DiscoverSubnetsPerAZ()
+    self.assertEqual({'us-west-1a': 'subnet-a1', 'us-west-1b': 'subnet-b2'},
+                     az_map)
+
+  # ---- _DiscoverNodeRoleArn -------------------------------------------------
+
+  def testDiscoverNodeRoleArn(self):
+    """_DiscoverNodeRoleArn returns role ARN from the first nodegroup."""
+    cluster = self._make_cluster()
+    list_out = json.dumps({'nodegroups': ['ng1']})
+    describe_out = json.dumps({
+        'nodegroup': {'nodeRole': 'arn:aws:iam::1234:role/MyRole'}
+    })
+    self.MockIssueCommand({
+        'list-nodegroups': [(list_out, '', 0)],
+        'describe-nodegroup': [(describe_out, '', 0)],
+    })
+    arn = cluster._DiscoverNodeRoleArn()
+    self.assertEqual('arn:aws:iam::1234:role/MyRole', arn)
+
+  def testDiscoverNodeRoleArnRaisesWhenNoNodegroup(self):
+    """_DiscoverNodeRoleArn raises CreationError when no nodegroups found."""
+    cluster = self._make_cluster()
+    list_out = json.dumps({'nodegroups': []})
+    self.MockIssueCommand({'list-nodegroups': [(list_out, '', 0)]})
+    with self.assertRaises(errors.Resource.CreationError):
+      cluster._DiscoverNodeRoleArn()
+
+  # ---- _ResolveReleaseVersion -----------------------------------------------
+
+  def testResolveReleaseVersion(self):
+    """_ResolveReleaseVersion returns the SSM parameter value."""
+    cluster = self._make_cluster()
+    self.MockIssueCommand({
+        'get-parameter': [('1.33.10-20260101\n', '', 0)]
+    })
+    version = cluster._ResolveReleaseVersion('1.33')
+    self.assertEqual('1.33.10-20260101', version)
+
+  def testResolveReleaseVersionCached(self):
+    """_ResolveReleaseVersion serves a repeated lookup without a CLI call."""
+    cluster = self._make_cluster()
+    self.MockIssueCommand({'get-parameter': [('1.34.2-20260101\n', '', 0)]})
+    v1 = cluster._ResolveReleaseVersion('1.34')
+    with mock.patch.object(vm_util, 'IssueCommand') as mock_issue:
+      v2 = cluster._ResolveReleaseVersion('1.34')
+    mock_issue.assert_not_called()
+    self.assertEqual(v1, v2)
+
+  def testResolveReleaseVersionRaisesOnFailure(self):
+    """_ResolveReleaseVersion raises CreationError when SSM lookup fails."""
+    cluster = self._make_cluster()
+    self.MockIssueCommand({'get-parameter': [('', 'not found', 1)]})
+    with self.assertRaises(errors.Resource.CreationError):
+      cluster._ResolveReleaseVersion('1.99')
+
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/tests/providers/azure/azure_kubernetes_service_test.py b/tests/providers/azure/azure_kubernetes_service_test.py
index 7ca09fb29c..3f6334e998 100644
--- a/tests/providers/azure/azure_kubernetes_service_test.py
+++ b/tests/providers/azure/azure_kubernetes_service_test.py
@@ -1,3 +1,6 @@
+"""Tests for the Azure Kubernetes Service provider."""
+# pylint: disable=invalid-name,protected-access
+
 import unittest
 from unittest import mock
 from absl.testing import flagsaver
@@ -7,12 +10,14 @@
 from perfkitbenchmarker.providers.azure import azure_kubernetes_service
 from perfkitbenchmarker.providers.azure import azure_network
 from perfkitbenchmarker.providers.azure import util
-from tests import pkb_common_test_case
+from tests import pkb_common_test_case  # pylint: disable=no-name-in-module
 
 
 class AzureKubernetesServiceTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the AksCluster provider."""
 
   def setUp(self):
+    """Sets up mocks and creates a default AksCluster for each test."""
     super().setUp()
     self.enter_context(
         mock.patch.object(
@@ -70,6 +75,7 @@ def initAksCluster(self, spec_dict):
     self.aks.resource_group.args = []
 
   def testCreate(self):
+    """AksCluster.Create() issues the expected az aks create command."""
     mock_cmd = self.MockIssueCommand(
         {
             'az aks create': [('', '', 0)],
@@ -119,6 +125,7 @@ def testCreate(self):
     )
 
   def testCreateError(self):
+    """AksCluster.Create() raises CreationError when az aks create fails."""
     self.MockIssueCommand(
         {
             'az aks create': [('out', 'Error could not create', 1)],
@@ -141,6 +148,7 @@ def testCreateError(self):
       self.aks.Create()
 
   def testCreateNodepool(self):
+    """Additional nodepools appear in az aks nodepool add commands."""
     mock_cmd = self.MockIssueCommand(
         {
             'az aks create': [('', '', 0)],
@@ -207,6 +215,7 @@ def testCreateAutoscaler(self):
     )
 
   def testCreateAutoscaler_NodepoolAndClamps(self):
+    """Autoscaler min/max/desired values propagate to nodepool add commands."""
     mock_cmd = self.MockIssueCommand(
         {
             'az aks create': [('', '', 0)],
@@ -229,12 +238,13 @@ def testCreateAutoscaler_NodepoolAndClamps(self):
     self.aks._Create()
     self.assertIn(
         '--enable-cluster-autoscaler --min-count=4 --max-count=6'
-        ' --node-count=4',
+        + ' --node-count=4',
         mock_cmd.all_commands,
     )
 
   @flagsaver.flagsaver(kubectl='kubectl', kubeconfig='dummy')
   def testFullCreateAksAutomatic(self):
+    """AksAutomaticCluster.Create() issues RBAC and policy assignment cmds."""
     aks_auto = azure_kubernetes_service.AksAutomaticCluster(self.spec)
     aks_auto.resource_group.name = 'resource-group'
     mock_cmd = self.MockIssueCommand(
@@ -258,7 +268,8 @@ def testFullCreateAksAutomatic(self):
                 ('servicePrincipal', '', 0),
                 ('user-name', '', 0),
                 (
-                    'test-user@example.com\n12345678-1234-1234-1234-123456789abc',
+                    'test-user@example.com\n'
+                    + '12345678-1234-1234-1234-123456789abc',
                     '',
                     0,
                 ),
@@ -272,7 +283,7 @@ def testFullCreateAksAutomatic(self):
     aks_auto.Create()
     self.assertIn(
         'az role assignment create --assignee user-name --role Azure Kubernetes'
-        ' Service RBAC Admin',
+        + ' Service RBAC Admin',
         mock_cmd.all_commands,
     )
     self.assertIn(
@@ -281,11 +292,12 @@ def testFullCreateAksAutomatic(self):
     )
     self.assertIn(
         'az policy assignment update --name'
-        ' aks-deployment-safeguards-policy-assignment',
+        + ' aks-deployment-safeguards-policy-assignment',
         mock_cmd.all_commands,
     )
 
   def testGetNodePoolNames(self):
+    """GetNodePoolNames returns pool names from az aks nodepool list output."""
     self.MockIssueCommand(
         {
             'az aks nodepool list': [(
@@ -308,5 +320,238 @@ def testGetNodePoolNames(self):
     self.assertEqual(self.aks.GetNodePoolNames(), ['default', 'nodepool1'])
 
 
+class AksManagementPlaneTest(AzureKubernetesServiceTest):
+  """Tests for AKS management-plane methods (k8s_management_benchmark)."""
+
+  # Subclassing AzureKubernetesServiceTest re-runs every inherited test here.
+  # Only testCreate/testCreateError are overridden below, as passing no-ops;
+  # NOTE(review): the other inherited tests still run a second time.
+  def testCreate(self):
+    pass
+
+  def testCreateError(self):
+    pass
+
+  def _make_nodepool_config(self, name='pkbpool0',
+                             machine_type='Standard_D2s_v5',
+                             num_nodes=2):
+    cfg = mock.MagicMock()
+    cfg.name = name
+    cfg.num_nodes = num_nodes
+    cfg.machine_type = machine_type
+    cfg.min_nodes = num_nodes
+    cfg.max_nodes = num_nodes
+    cfg.disk_size = 100
+    return cfg
+
+  # ---- CreateNodePool -------------------------------------------------------
+
+  def testCreateNodePool(self):
+    """CreateNodePool issues 'az aks nodepool add' with cluster-name."""
+    mock_cmd = self.MockIssueCommand({'az aks nodepool add': [('', '', 0)]})
+    self.aks.CreateNodePool(self._make_nodepool_config('testpool'))
+
+    self.assertIn('az aks nodepool add', mock_cmd.all_commands)
+    self.assertIn('--cluster-name', mock_cmd.all_commands)
+    self.assertIn('--labels', mock_cmd.all_commands)
+
+  def testCreateNodePoolWithVersion(self):
+    """CreateNodePool passes --kubernetes-version when node_version is set."""
+    self.aks.cluster_version = '1.33'
+    mock_cmd = self.MockIssueCommand({'az aks nodepool add': [('', '', 0)]})
+    self.aks.CreateNodePool(
+        self._make_nodepool_config('verpool'), node_version='1.32'
+    )
+    self.assertIn('--kubernetes-version 1.32', mock_cmd.all_commands)
+
+  def testCreateNodePoolRaisesOnFailure(self):
+    """CreateNodePool raises CreationError when CLI fails."""
+    self.MockIssueCommand({'az aks nodepool add': [('', 'error', 1)]})
+    with self.assertRaises(errors.Resource.CreationError):
+      self.aks.CreateNodePool(self._make_nodepool_config('failpool'))
+
+  # ---- DeleteNodePool -------------------------------------------------------
+
+  def testDeleteNodePool(self):
+    """DeleteNodePool issues 'az aks nodepool delete' with cluster-name."""
+    mock_cmd = self.MockIssueCommand(
+        {'az aks nodepool delete': [('', '', 0)]}
+    )
+    self.aks.DeleteNodePool('old-pool')
+
+    self.assertIn('az aks nodepool delete', mock_cmd.all_commands)
+    self.assertIn('--cluster-name', mock_cmd.all_commands)
+
+  # ---- UpgradeNodePool ------------------------------------------------------
+
+  def testUpgradeNodePool(self):
+    """UpgradeNodePool issues 'az aks nodepool upgrade' with version."""
+    mock_cmd = self.MockIssueCommand(
+        {'az aks nodepool upgrade': [('', '', 0)]}
+    )
+    self.aks.UpgradeNodePool('my-pool', '1.34')
+
+    self.assertIn('az aks nodepool upgrade', mock_cmd.all_commands)
+    self.assertIn('--kubernetes-version 1.34', mock_cmd.all_commands)
+
+  # ---- UpdateCluster --------------------------------------------------------
+
+  def testUpdateCluster(self):
+    """UpdateCluster issues 'az aks update' with a timestamp tag."""
+    mock_cmd = self.MockIssueCommand({'az aks update': [('', '', 0)]})
+    self.aks.UpdateCluster()
+
+    self.assertIn('az aks update', mock_cmd.all_commands)
+    self.assertIn('--tags', mock_cmd.all_commands)
+    self.assertIn('k8s-mgmt-ts=', mock_cmd.all_commands)
+
+  # ---- CreateNodePoolAsync --------------------------------------------------
+
+  def testCreateNodePoolAsyncReturnsNpSucceededHandle(self):
+    """CreateNodePoolAsync issues nodepool add with --no-wait."""
+    mock_cmd = self.MockIssueCommand(
+        {'az aks nodepool add': [('', '', 0)]}
+    )
+    handle = self.aks.CreateNodePoolAsync(self._make_nodepool_config('apool'))
+
+    self.assertIn('--no-wait', mock_cmd.all_commands)
+    self.assertTrue(handle.startswith('np_succeeded:'))
+
+  def testCreateNodePoolAsyncRaisesOnFailure(self):
+    """CreateNodePoolAsync raises CreationError on CLI failure."""
+    self.MockIssueCommand({'az aks nodepool add': [('', 'err', 1)]})
+    with self.assertRaises(errors.Resource.CreationError):
+      self.aks.CreateNodePoolAsync(self._make_nodepool_config('failpool'))
+
+  # ---- UpgradeNodePoolAsync -------------------------------------------------
+
+  def testUpgradeNodePoolAsyncReturnsNpSucceededHandle(self):
+    """UpgradeNodePoolAsync issues upgrade with --no-wait."""
+    mock_cmd = self.MockIssueCommand(
+        {'az aks nodepool upgrade': [('', '', 0)]}
+    )
+    handle = self.aks.UpgradeNodePoolAsync('my-pool', '1.34')
+
+    self.assertIn('--no-wait', mock_cmd.all_commands)
+    self.assertTrue(handle.startswith('np_succeeded:'))
+    self.assertIn('--kubernetes-version 1.34', mock_cmd.all_commands)
+
+  # ---- DeleteNodePoolAsync --------------------------------------------------
+
+  def testDeleteNodePoolAsyncReturnsNpGoneHandle(self):
+    """DeleteNodePoolAsync issues delete with --no-wait."""
+    mock_cmd = self.MockIssueCommand(
+        {'az aks nodepool delete': [('', '', 0)]}
+    )
+    handle = self.aks.DeleteNodePoolAsync('rm-pool')
+
+    self.assertIn('--no-wait', mock_cmd.all_commands)
+    self.assertTrue(handle.startswith('np_gone:'))
+
+  # ---- UpdateClusterAsync ---------------------------------------------------
+
+  def testUpdateClusterAsyncScalesSystemPool(self):
+    """UpdateClusterAsync scales the system pool; returns cluster_succeeded."""
+    pools_json = '[{"name": "nodepool1", "count": 2}]'
+    self.MockIssueCommand({
+        'az aks nodepool list': [(pools_json, '', 0)],
+        'az aks nodepool scale': [('', '', 0)],
+    })
+    handle = self.aks.UpdateClusterAsync()
+    self.assertEqual('cluster_succeeded', handle)
+
+  def testUpdateClusterAsyncFallbackTagUpdate(self):
+    """UpdateClusterAsync falls back to tag update when nodepool list fails."""
+    self.MockIssueCommand({
+        'az aks nodepool list': [('', 'err', 1)],
+        'az aks update': [('', '', 0)],
+    })
+    handle = self.aks.UpdateClusterAsync()
+    self.assertEqual('cluster_succeeded', handle)
+
+  # ---- WaitForOperation -----------------------------------------------------
+
+  def testWaitForOperationNpSucceeded(self):
+    """WaitForOperation(np_succeeded:name) returns on Succeeded state."""
+    self.MockIssueCommand(
+        {'az aks nodepool show': [('Succeeded\n', '', 0)]}
+    )
+    # Should not raise
+    self.aks.WaitForOperation('np_succeeded:mypool')
+
+  def testWaitForOperationNpSucceededFailedRaises(self):
+    """WaitForOperation raises CreationError on Failed provisioningState."""
+    self.MockIssueCommand(
+        {'az aks nodepool show': [('Failed\n', '', 0)]}
+    )
+    with self.assertRaises(errors.Resource.CreationError):
+      self.aks.WaitForOperation('np_succeeded:failpool')
+
+  def testWaitForOperationNpGone(self):
+    """WaitForOperation(np_gone:name) returns when nodepool is not found."""
+    self.MockIssueCommand({
+        'az aks nodepool show': [('', 'NotFound', 1)]
+    })
+    # Should not raise
+    self.aks.WaitForOperation('np_gone:deleted-pool')
+
+  def testWaitForOperationClusterSucceeded(self):
+    """WaitForOperation(cluster_succeeded) returns on Succeeded state."""
+    self.MockIssueCommand({
+        'az aks show': [('Succeeded\n', '', 0)]
+    })
+    # Should not raise
+    self.aks.WaitForOperation('cluster_succeeded')
+
+  def testWaitForOperationClusterSucceededFailedRaises(self):
+    """WaitForOperation raises CreationError when cluster update is Failed."""
+    self.MockIssueCommand({
+        'az aks show': [('Failed\n', '', 0)]
+    })
+    with self.assertRaises(errors.Resource.CreationError):
+      self.aks.WaitForOperation('cluster_succeeded')
+
+  def testWaitForOperationUnknownHandleRaises(self):
+    """WaitForOperation raises ValueError for an unknown handle prefix."""
+    with self.assertRaises(ValueError):
+      self.aks.WaitForOperation('bad_handle:something')
+
+  # ---- ResolveNodePoolVersions ----------------------------------------------
+
+  def testResolveNodePoolVersionsNMinus1Math(self):
+    """ResolveNodePoolVersions returns (N-1, N) from cluster_version."""
+    self.aks.cluster_version = '1.34'
+    initial, target = self.aks.ResolveNodePoolVersions()
+    self.assertEqual('1.33', initial)
+    self.assertEqual('1.34', target)
+
+  def testResolveNodePoolVersionsStripsMinorPatch(self):
+    """ResolveNodePoolVersions strips patch from full version string."""
+    self.aks.cluster_version = '1.33.5'
+    initial, target = self.aks.ResolveNodePoolVersions()
+    self.assertEqual('1.32', initial)
+    self.assertEqual('1.33', target)
+
+  # ---- _GetNodeFlags with version_override ----------------------------------
+
+  def testGetNodeFlagsVersionOverride(self):
+    """_GetNodeFlags uses version_override instead of cluster_version."""
+    self.aks.cluster_version = '1.34'
+    cfg = self._make_nodepool_config()
+    flags = self.aks._GetNodeFlags(cfg, version_override='1.33')
+    self.assertIn('--kubernetes-version', flags)
+    idx = flags.index('--kubernetes-version')
+    self.assertEqual('1.33', flags[idx + 1])
+
+  def testGetNodeFlagsUsesClusterVersionWhenNoOverride(self):
+    """_GetNodeFlags uses cluster_version when version_override is None."""
+    self.aks.cluster_version = '1.34'
+    cfg = self._make_nodepool_config()
+    flags = self.aks._GetNodeFlags(cfg, version_override=None)
+    self.assertIn('--kubernetes-version', flags)
+    idx = flags.index('--kubernetes-version')
+    self.assertEqual('1.34', flags[idx + 1])
+
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/tests/providers/gcp/google_kubernetes_engine_test.py b/tests/providers/gcp/google_kubernetes_engine_test.py
index dbf8232f5e..d49ac77d2a 100644
--- a/tests/providers/gcp/google_kubernetes_engine_test.py
+++ b/tests/providers/gcp/google_kubernetes_engine_test.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 """Tests for perfkitbenchmarker.providers.gcp.google_kubernetes_engine."""
 
-# pylint: disable=not-context-manager
+# pylint: disable=not-context-manager,invalid-name,protected-access
 
 import builtins
 import contextlib
+import json
 import os
 import tempfile
 import unittest
@@ -61,6 +62,7 @@ class PatchedObjectsTestCase(pkb_common_test_case.PkbCommonTestCase):
   def patch_critical_objects(
       self, stdout='', stderr='', return_code=0, flags=FLAGS
   ):
+    """Patches common objects and yields a mock IssueCommand."""
     with contextlib.ExitStack() as stack:
       flags.gcloud_path = 'gcloud'
       flags.run_uri = _RUN_URI
@@ -99,10 +101,12 @@ def patch_critical_objects(
 
 
 class GoogleContainerRegistryTestCase(PatchedObjectsTestCase):
+  """Tests for the GoogleArtifactRegistry container registry."""
 
   class FakeContainerImage(container.ContainerImage):
+    """Minimal ContainerImage stub for registry tests."""
 
-    def __init__(self, name, directory=None):
+    def __init__(self, name, directory=None):  # pylint: disable=super-init-not-called
       self.name = name
       self.directory = directory or f'docker/{name}/Dockerfile'
 
@@ -117,6 +121,7 @@ def setUp(self):
     )
 
   def testFullRegistryTag(self):
+    """Tests that full registry tag is constructed correctly."""
     spec = container_spec.ContainerRegistrySpec(
         'NAME',
         **{
@@ -132,6 +137,7 @@ def testFullRegistryTag(self):
     )
 
   def testRemoteBuildCreateSucceeds(self):
+    """Tests that _Build succeeds when gcloud Issue returns success."""
     spec = container_spec.ContainerRegistrySpec(
         'NAME',
         **{
@@ -147,9 +153,11 @@ def testRemoteBuildCreateSucceeds(self):
 
 
 class GoogleKubernetesEngineCustomMachineTypeTestCase(PatchedObjectsTestCase):
+  """Tests for GKE cluster creation with a custom machine type."""
 
   @staticmethod
   def create_kubernetes_engine_spec():
+    """Creates a GKE spec with a custom CPU/memory machine type."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -180,9 +188,11 @@ def testCreate(self):
 
 
 class GoogleKubernetesEngineTestCase(PatchedObjectsTestCase):
+  """Tests for standard GKE cluster create/delete/exists operations."""
 
   @staticmethod
   def create_kubernetes_engine_spec():
+    """Creates a standard GKE cluster spec with typical VM options."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -204,6 +214,7 @@ def create_kubernetes_engine_spec():
     return kubernetes_engine_spec
 
   def testCreate(self):
+    """Tests that _Create issues the correct gcloud command with all flags."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects() as issue_command:
       cluster = google_kubernetes_engine.GkeCluster(spec)
@@ -242,6 +253,7 @@ def testCreateQuotaExceeded(self):
         cluster._Create()
 
   def testCreateResourcesExhausted(self):
+    """Tests _Create raises InsufficientCapacityCloudFailure on exhaustion."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects(
         stderr="""
@@ -258,6 +270,7 @@ def testCreateResourcesExhausted(self):
         cluster._Create()
 
   def testGetCredentials(self):
+    """Tests that _PostCreate issues get-credentials with KUBECONFIG set."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects() as issue_command, mock.patch.object(
         kubectl, 'RunKubectlCommand'
@@ -266,7 +279,7 @@ def testGetCredentials(self):
       cluster._Create()
       cluster._PostCreate()
       self.assertIn(
-          'gcloud container clusters get-credentials pkb-{}'.format(_RUN_URI),
+          f'gcloud container clusters get-credentials pkb-{_RUN_URI}',
           issue_command.all_commands,
       )
       self.assertIn(
@@ -282,7 +295,7 @@ def testDelete(self):
       cluster._Delete()
       self.assertEqual(issue_command.func_to_mock.call_count, 5)
       self.assertIn(
-          'gcloud container clusters delete pkb-{}'.format(_RUN_URI),
+          f'gcloud container clusters delete pkb-{_RUN_URI}',
           issue_command.all_commands,
       )
       self.assertIn('--zone us-central1-a', issue_command.all_commands)
@@ -293,11 +306,12 @@ def testExists(self):
       cluster = google_kubernetes_engine.GkeCluster(spec)
       cluster._Exists()
       self.assertIn(
-          'gcloud container clusters describe pkb-{}'.format(_RUN_URI),
+          f'gcloud container clusters describe pkb-{_RUN_URI}',
           issue_command.all_commands,
       )
 
   def testGetResourceMetadata(self):
+    """Tests that GetResourceMetadata returns all expected fields."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects(stdout=_KUBECTL_VERSION):
       cluster = google_kubernetes_engine.GkeCluster(spec)
@@ -328,9 +342,11 @@ def testCidrCalculations(self):
 
 
 class GoogleKubernetesEngineAutoscalingTestCase(PatchedObjectsTestCase):
+  """Tests for GKE cluster creation with cluster-level autoscaling."""
 
   @staticmethod
   def create_kubernetes_engine_spec():
+    """Creates a GKE spec with cluster-level autoscaling enabled."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -350,6 +366,7 @@ def create_kubernetes_engine_spec():
     return kubernetes_engine_spec
 
   def testCreate(self):
+    """Tests that _Create passes autoscaling flags to gcloud."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects() as issue_command:
       cluster = google_kubernetes_engine.GkeCluster(spec)
@@ -364,6 +381,7 @@ def testCreate(self):
       self.assertIn('--cluster-ipv4-cidr /18', issue_command.all_commands)
 
   def testGetResourceMetadata(self):
+    """Tests that metadata includes autoscaling size fields."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects(stdout=_KUBECTL_VERSION):
       cluster = google_kubernetes_engine.GkeCluster(spec)
@@ -393,9 +411,11 @@ def testLabelDisks(self):
 
 
 class GoogleKubernetesEngineVersionFlagTestCase(PatchedObjectsTestCase):
+  """Tests for GKE cluster creation with version and release-channel flags."""
 
   @staticmethod
   def create_kubernetes_engine_spec():
+    """Creates a GKE spec for testing version and release-channel flags."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -443,9 +463,11 @@ def testCreateRapidChannel(self):
 
 
 class GoogleKubernetesEngineGvnicFlagTestCase(PatchedObjectsTestCase):
+  """Tests for GKE cluster creation with gVNIC enable/disable flags."""
 
   @staticmethod
   def create_kubernetes_engine_spec():
+    """Creates a GKE spec for testing the gVNIC flag."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -478,9 +500,11 @@ def testCreateDisableGvnic(self):
 
 
 class GoogleKubernetesEngineWithGpusTestCase(PatchedObjectsTestCase):
+  """Tests for GKE cluster creation with GPU accelerator configuration."""
 
   @staticmethod
   def create_kubernetes_engine_spec(gpu_type):
+    """Creates a GKE spec with the given GPU type and 2 GPUs."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -501,6 +525,7 @@ def create_kubernetes_engine_spec(gpu_type):
 
   @flagsaver.flagsaver(gke_gpu_driver_version='latest')
   def testCreate(self):
+    """Tests that _Create includes the correct --accelerator flag for K80."""
     spec = self.create_kubernetes_engine_spec('k80')
     with self.patch_critical_objects() as issue_command:
       cluster = google_kubernetes_engine.GkeCluster(spec)
@@ -525,16 +550,19 @@ def testCreateGpuH100(self):
       cluster._Create()
       self.assertIn(
           '--accelerator '
-          'type=nvidia-h100-80gb,count=2,gpu-driver-version=default',
+          + 'type=nvidia-h100-80gb,count=2,gpu-driver-version=default',
           issue_command.all_commands,
       )
 
 
 class GoogleKubernetesEngineGetNodesTestCase(GoogleKubernetesEngineTestCase):
+  """Tests for GKE node/instance-group enumeration methods."""
 
   def testGetInstanceGroups(self):
+    """Tests that _GetInstanceGroups parses node-pools list output."""
     path = os.path.join(os.path.dirname(__file__), _NODE_POOLS_LIST_OUTPUT)
-    output = open(path).read()
+    with open(path) as f:
+      output = f.read()
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects(stdout=output) as issue_command:
       cluster = google_kubernetes_engine.GkeCluster(spec)
@@ -552,9 +580,13 @@ def testGetInstanceGroups(self):
       self.assertEqual(expected, set(instance_groups))  # order doesn't matter
 
   def testGetNodePoolNames(self):
-    output = ['default-pool', 'nap-e2-standard-2-iu4vquho', 'test-pool']
+    """Tests that GetNodePoolNames returns names from cluster describe."""
+    pool_names = ['default-pool', 'nap-e2-standard-2-iu4vquho', 'test-pool']
+    json_output = json.dumps(
+        {'nodePools': [{'name': n} for n in pool_names]}
+    )
     spec = self.create_kubernetes_engine_spec()
-    with self.patch_critical_objects(stdout='\n'.join(output)) as issue_command:
+    with self.patch_critical_objects(stdout=json_output) as issue_command:
       cluster = google_kubernetes_engine.GkeCluster(spec)
       node_pools = cluster.GetNodePoolNames()
 
@@ -562,8 +594,8 @@ def testGetNodePoolNames(self):
           'gcloud container clusters describe ' + cluster.name,
           issue_command.all_commands,
       )
-      self.assertIn('--flatten', issue_command.all_commands)
-      self.assertIn('--format', issue_command.all_commands)
+      self.assertIn('--format json', issue_command.all_commands)
+      self.assertNotIn('--flatten', issue_command.all_commands)
 
       expected = {
           'default-pool',
@@ -574,9 +606,11 @@ def testGetNodePoolNames(self):
 
 
 class GoogleKubernetesEngineRegionalTestCase(PatchedObjectsTestCase):
+  """Tests for GKE regional cluster creation with multiple nodepools."""
 
   @staticmethod
   def create_kubernetes_engine_spec(use_zonal_nodepools=False):
+    """Creates a regional GKE spec with two nodepools."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -619,6 +653,7 @@ def create_kubernetes_engine_spec(use_zonal_nodepools=False):
     return kubernetes_engine_spec
 
   def testCreateRegionalCluster(self):
+    """Tests regional cluster creation with region-wide nodepools."""
     spec = self.create_kubernetes_engine_spec(use_zonal_nodepools=False)
     with self.patch_critical_objects() as issue_command:
       cluster = google_kubernetes_engine.GkeCluster(spec)
@@ -668,6 +703,7 @@ def testCreateRegionalCluster(self):
       self.assertNotIn('--node-locations', create_nodepool2)
 
   def testCreateRegionalClusterZonalNodepool(self):
+    """Tests regional cluster creation with zone-pinned nodepools."""
     spec = self.create_kubernetes_engine_spec(use_zonal_nodepools=True)
     with self.patch_critical_objects() as issue_command:
       cluster = google_kubernetes_engine.GkeCluster(spec)
@@ -706,9 +742,11 @@ def testCreateRegionalClusterZonalNodepool(self):
 
 
 class GoogleKubernetesEngineMachineFamiliesTestCase(PatchedObjectsTestCase):
+  """Tests for GKE nodepool creation with machine-family constraints."""
 
   @staticmethod
   def create_kubernetes_engine_spec():
+    """Creates a GKE spec with a nodepool using machine families."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -735,6 +773,7 @@ def create_kubernetes_engine_spec():
     return kubernetes_engine_spec
 
   def testCreateWithMachineFamilies(self):
+    """Tests that machine-family nodepool issues a node-pools update command."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects() as issue_command, mock.patch.object(
         kubernetes_commands, 'ApplyYaml'
@@ -752,9 +791,11 @@ def testCreateWithMachineFamilies(self):
 
 
 class GoogleKubernetesEngineAutopilotTestCase(PatchedObjectsTestCase):
+  """Tests for GKE Autopilot cluster creation and metadata."""
 
   @staticmethod
   def create_kubernetes_engine_spec():
+    """Creates a GKE Autopilot cluster spec."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -771,6 +812,7 @@ def create_kubernetes_engine_spec():
     return kubernetes_engine_spec
 
   def testCreate(self):
+    """Tests Autopilot _Create uses create-auto without node flags."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects() as issue_command:
       cluster = google_kubernetes_engine.GkeAutopilotCluster(spec)
@@ -788,6 +830,7 @@ def testCreate(self):
       self.assertNotIn('--num-nodes', issue_command.all_commands)
 
   def testGetResourceMetadata(self):
+    """Tests that Autopilot metadata includes Auto values for size/type."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects():
       cluster = google_kubernetes_engine.GkeAutopilotCluster(spec)
@@ -818,8 +861,9 @@ def testGetResourceMetadataIncludesReleaseChannel(self):
           metadata,
       )
 
-  @flagsaver.flagsaver(run_uri='123')
+  @flagsaver.flagsaver(gpu_type='h100', gpu_count=1, run_uri='123')
   def testApplyYamlGpusH100(self):
+    """Tests Autopilot YAML generation for H100 GPU node selectors."""
     self.enter_context(
         mock.patch(
             gce_network.__name__ + '.GceFirewall.GetFirewall',
@@ -858,8 +902,6 @@ def testApplyYamlGpusH100(self):
         )
     )
     spec = self.create_kubernetes_engine_spec()
-    spec.vm_spec.gpu_count = 1
-    spec.vm_spec.gpu_type = 'h100'
     with self.assertLogs(level='INFO') as logs:
       cluster = google_kubernetes_engine.GkeAutopilotCluster(spec)
       yamls = kubernetes_commands.ConvertManifestToYamlDicts(
@@ -884,6 +926,7 @@ def testApplyYamlGpusH100(self):
     self.assertIn('cloud.google.com/compute-class: Accelerator', full_logs)
 
   def testGetMachineTypeFromNodeName(self):
+    """Tests GetMachineTypeFromNodeName queries kubectl for node type."""
     spec = self.create_kubernetes_engine_spec()
     with self.patch_critical_objects():
       cluster = google_kubernetes_engine.GkeAutopilotCluster(spec)
@@ -899,8 +942,10 @@ def testGetMachineTypeFromNodeName(self):
 
 
 class GoogleKubernetesEngineNodepoolAutoscalingTestCase(PatchedObjectsTestCase):
+  """Tests GKE per-nodepool autoscaling overrides cluster-level settings."""
 
   def testCreateWithPerNodepoolAutoscaling(self):
+    """Tests per-nodepool autoscaling settings override cluster defaults."""
     kubernetes_engine_spec = container_spec.ContainerClusterSpec(
         'NAME',
         **{
@@ -949,5 +994,298 @@ def testCreateWithPerNodepoolAutoscaling(self):
       self.assertIn('--max-nodes 10', nodepool_cmd)
 
 
+class GkeManagementPlaneTestCase(PatchedObjectsTestCase):
+  """Tests for GKE management-plane methods (k8s_management_benchmark)."""
+
+  @staticmethod
+  def create_kubernetes_engine_spec():
+    """Creates a GKE spec for management-plane method tests."""
+    return container_spec.ContainerClusterSpec(
+        'NAME',
+        **{
+            'cloud': 'GCP',
+            'vm_spec': {
+                'GCP': {
+                    'machine_type': 'fake-machine-type',
+                    'zone': 'us-central1-a',
+                },
+            },
+            'vm_count': 2,
+            'poll_for_events': False,
+        },
+    )
+
+  def _make_nodepool_config(self, name='pkbpool0'):
+    """Returns a MagicMock carrying the node pool fields these tests set."""
+    cfg = mock.MagicMock()
+    cfg.name = name
+    cfg.num_nodes = 1
+    cfg.machine_type = 'n1-standard-2'
+    cfg.disk_size = 100
+    cfg.max_local_disks = 0
+    cfg.zone = None
+    return cfg
+
+  # ---- GetNodePoolNames -----------------------------------------------------
+
+  def testGetNodePoolNamesJsonFormat(self):
+    """GetNodePoolNames parses JSON cluster describe output."""
+    cluster_json = (
+        '{"nodePools": [{"name": "default-pool"}, {"name": "extra-pool"}]}'
+    )
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout=cluster_json) as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      names = cluster.GetNodePoolNames()
+
+      self.assertIn(
+          'gcloud container clusters describe ' + cluster.name,
+          issue_command.all_commands,
+      )
+      self.assertIn('--format', issue_command.all_commands)
+      # Must NOT use --flatten (old format)
+      self.assertNotIn('--flatten', issue_command.all_commands)
+      self.assertEqual({'default-pool', 'extra-pool'}, set(names))
+
+  def testGetNodePoolNamesEmptyFallback(self):
+    """GetNodePoolNames falls back to split() on non-JSON output."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout='pool-a pool-b'):
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      names = cluster.GetNodePoolNames()
+      self.assertEqual({'pool-a', 'pool-b'}, set(names))
+
+  # ---- CreateNodePool -------------------------------------------------------
+
+  def testCreateNodePool(self):
+    """CreateNodePool issues gcloud node-pools create with cluster flag."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cfg = self._make_nodepool_config('mypool')
+      cluster.CreateNodePool(cfg)
+
+      cmd = issue_command.GetCommandWithSubstring('node-pools create mypool')
+      self.assertIn('--cluster', cmd)
+      self.assertNotIn('--node-version', cmd)
+
+  def testCreateNodePoolWithVersion(self):
+    """CreateNodePool passes --node-version when provided."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cfg = self._make_nodepool_config('mypool')
+      cluster.CreateNodePool(cfg, node_version='1.34.1-gke.100')
+
+      cmd = issue_command.GetCommandWithSubstring('node-pools create mypool')
+      self.assertIn('--node-version 1.34.1-gke.100', cmd)
+
+  # ---- DeleteNodePool -------------------------------------------------------
+
+  def testDeleteNodePool(self):
+    """DeleteNodePool issues gcloud node-pools delete with --quiet."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster.DeleteNodePool('old-pool')
+
+      cmd = issue_command.GetCommandWithSubstring('node-pools delete old-pool')
+      self.assertIn('--cluster', cmd)
+      self.assertIn('--quiet', cmd)
+
+  # ---- UpgradeNodePool ------------------------------------------------------
+
+  def testUpgradeNodePool(self):
+    """UpgradeNodePool issues gcloud clusters upgrade with --node-pool."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster.UpgradeNodePool('my-pool', '1.34.1-gke.200')
+
+      cmd = issue_command.GetCommandWithSubstring('clusters upgrade')
+      self.assertIn('--node-pool my-pool', cmd)
+      self.assertIn('--cluster-version 1.34.1-gke.200', cmd)
+      self.assertIn('--quiet', cmd)
+
+  # ---- UpdateCluster --------------------------------------------------------
+
+  def testUpdateCluster(self):
+    """UpdateCluster issues gcloud clusters update with a timestamp label."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster.UpdateCluster()
+
+      cmd = issue_command.GetCommandWithSubstring('clusters update')
+      self.assertIn('--update-labels', cmd)
+      self.assertIn('k8s-mgmt-ts=', cmd)
+
+  # ---- Async variants -------------------------------------------------------
+
+  def testCreateNodePoolAsyncReturnsOpName(self):
+    """CreateNodePoolAsync returns the GKE operation name."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(
+        stdout='extra line\noperation-1234\n'
+    ) as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cfg = self._make_nodepool_config('asyncpool')
+      handle = cluster.CreateNodePoolAsync(cfg)
+
+      cmd = issue_command.GetCommandWithSubstring('node-pools create asyncpool')
+      self.assertIn('--async', cmd)
+      self.assertNotIn('--timeout', cmd)
+      self.assertEqual('operation-1234', handle)
+
+  def testCreateNodePoolAsyncWithVersion(self):
+    """CreateNodePoolAsync passes --node-version when provided."""
+    spec = self.create_kubernetes_engine_spec()
+    stdout = 'operation-5678\n'
+    with self.patch_critical_objects(stdout=stdout) as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cfg = self._make_nodepool_config('verpool')
+      cluster.CreateNodePoolAsync(cfg, node_version='1.33.5-gke.1')
+
+      cmd = issue_command.GetCommandWithSubstring('node-pools create verpool')
+      self.assertIn('--node-version 1.33.5-gke.1', cmd)
+
+  def testDeleteNodePoolAsyncReturnsOpName(self):
+    """DeleteNodePoolAsync adds --async/--quiet and returns the op name."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout='operation-del\n') as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      handle = cluster.DeleteNodePoolAsync('to-delete')
+
+      cmd = issue_command.GetCommandWithSubstring('node-pools delete to-delete')
+      self.assertIn('--async', cmd)
+      self.assertIn('--quiet', cmd)
+      self.assertEqual('operation-del', handle)
+
+  def testUpgradeNodePoolAsyncReturnsOpName(self):
+    """UpgradeNodePoolAsync adds --async and returns the op name."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout='operation-upg\n') as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      handle = cluster.UpgradeNodePoolAsync('my-pool', '1.34.2-gke.100')
+
+      cmd = issue_command.GetCommandWithSubstring('clusters upgrade')
+      self.assertIn('--async', cmd)
+      self.assertIn('--node-pool my-pool', cmd)
+      self.assertIn('--cluster-version 1.34.2-gke.100', cmd)
+      self.assertEqual('operation-upg', handle)
+
+  def testUpdateClusterAsyncReturnsOpName(self):
+    """UpdateClusterAsync adds --async and returns the op name."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout='operation-upd\n') as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      handle = cluster.UpdateClusterAsync()
+
+      cmd = issue_command.GetCommandWithSubstring('clusters update')
+      self.assertIn('--async', cmd)
+      self.assertIn('k8s-mgmt-ts=', cmd)
+      self.assertEqual('operation-upd', handle)
+
+  def testIssueAsyncRaisesOnNonZeroRetcode(self):
+    """_IssueAsync raises CreationError when the command fails."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stderr='boom', return_code=1):
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cfg = self._make_nodepool_config('failpool')
+      with self.assertRaises(errors.Resource.CreationError):
+        cluster.CreateNodePoolAsync(cfg)
+
+  def testIssueAsyncRaisesOnEmptyOpName(self):
+    """_IssueAsync raises CreationError when stdout produces no op name."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout='   \n   '):
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cfg = self._make_nodepool_config('emptypool')
+      with self.assertRaises(errors.Resource.CreationError):
+        cluster.CreateNodePoolAsync(cfg)
+
+  # ---- WaitForOperation -----------------------------------------------------
+
+  def testWaitForOperationDone(self):
+    """WaitForOperation returns immediately when status is DONE."""
+    spec = self.create_kubernetes_engine_spec()
+    done_json = '{"status": "DONE"}'
+    with self.patch_critical_objects(stdout=done_json):
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      # No assertion needed: the test passes as long as this does not raise.
+      cluster.WaitForOperation('operation-xyz')
+
+  def testWaitForOperationAbortingRaises(self):
+    """WaitForOperation raises CreationError when status is ABORTING."""
+    spec = self.create_kubernetes_engine_spec()
+    aborted_json = '{"status": "ABORTING"}'
+    with self.patch_critical_objects(stdout=aborted_json):
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      with self.assertRaises(errors.Resource.CreationError):
+        cluster.WaitForOperation('operation-bad')
+
+  # ---- ResolveNodePoolVersions ----------------------------------------------
+
+  def testResolveNodePoolVersions(self):
+    """ResolveNodePoolVersions returns (N-1 qualified, N qualified)."""
+    server_config = {
+        'validNodeVersions': [
+            '1.34.5-gke.100',
+            '1.34.3-gke.50',
+            '1.33.8-gke.200',
+            '1.33.5-gke.99',
+            '1.32.1-gke.10',
+        ]
+    }
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(
+        stdout=json.dumps(server_config)
+    ) as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      initial, target = cluster.ResolveNodePoolVersions()
+
+      cmd = issue_command.GetCommandWithSubstring('get-server-config')
+      self.assertIn('--format', cmd)
+      # target = newest overall = 1.34.5-gke.100
+      self.assertEqual('1.34.5-gke.100', target)
+      # initial = best version for minor 33 = 1.33.8-gke.200
+      self.assertEqual('1.33.8-gke.200', initial)
+
+  def testResolveNodePoolVersionsNoVersionsRaises(self):
+    """ResolveNodePoolVersions raises GetError when versions list is empty."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout='{"validNodeVersions": []}'):
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      with self.assertRaises(errors.Resource.GetError):
+        cluster.ResolveNodePoolVersions()
+
+  # ---- HasActiveUpgradeOperations -------------------------------------------
+
+  def testHasActiveUpgradeOperationsTrue(self):
+    """HasActiveUpgradeOperations returns True when an upgrade is running."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout='operation-upgrade-123\n'):
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      self.assertTrue(cluster.HasActiveUpgradeOperations())
+
+  def testHasActiveUpgradeOperationsFalse(self):
+    """HasActiveUpgradeOperations returns False when no upgrade is running."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout=''):
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      self.assertFalse(cluster.HasActiveUpgradeOperations())
+
+  def testHasActiveUpgradeOperationsUsesCorrectFilter(self):
+    """HasActiveUpgradeOperations queries for UPGRADE_NODES AND RUNNING."""
+    spec = self.create_kubernetes_engine_spec()
+    with self.patch_critical_objects(stdout='') as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster.HasActiveUpgradeOperations()
+
+      self.assertIn('operations list', issue_command.all_commands)
+      self.assertIn('UPGRADE_NODES', issue_command.all_commands)
+      self.assertIn('RUNNING', issue_command.all_commands)
+
+
 if __name__ == '__main__':
   unittest.main()