diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 82a7592f14cb..9c8a3e85cf3e 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -70,7 +70,7 @@
     WEIGHT_QUANTIZE_ROUNDING, \
     WEIGHT_QUANTIZE_VERBOSE, \
     WEIGHT_QUANTIZE_KERNEL
-from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FROZEN_PARAM_FRAGMENTS, UNIVERSAL_CHECKPOINT_INFO
+from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FROZEN_PARAM_FRAGMENTS, UNIVERSAL_CHECKPOINT_INFO, BASE_OPTIMIZER_STATE
 from deepspeed.checkpoint.utils import clone_tensors_for_torch_save
 from deepspeed.checkpoint.ds_to_universal import dp_index_to_str
 from deepspeed.runtime.sparse_tensor import SparseTensor
@@ -2072,6 +2072,7 @@ def _configure_zero_optimizer(self, optimizer):
                     enable_sanity_checks=self.is_sanity_checks_enabled(),
                     cpuadam_cores_perc=self.cpuadam_cores_perc(),
                     save_muon_momentum_buffer_in_memory=self.zero_save_muon_momentum_buffer_in_memory(),
+                    elastic_checkpoint=self.zero_elastic_checkpoint(),
                 )
 
         else:
@@ -3767,11 +3768,16 @@ def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True):
             checkpoint_folder = f'{os.path.join(load_dir, tag)}'
         else:
             if load_optimizer_states and self.seq_dp_world_size != self.loaded_checkpoint_dp_world_size:
-                raise ZeRORuntimeException("The checkpoint being loaded used a DP " \
-                    f"world size of {self.loaded_checkpoint_dp_world_size} but the " \
-                    f"current world size is {self.seq_dp_world_size}. Automatic adjustment " \
-                    "of ZeRO's optimizer state partitioning with a new world size is not " \
-                    "currently supported.")
+                # ZeRO Stage 3 elastic checkpoint repartitions optimizer states internally
+                # across arbitrary DP world sizes; skip the guard for that case.
+                zero3_elastic = (self.zero_optimization_stage() == ZeroStageEnum.weights
+                                 and self.zero_elastic_checkpoint())
+                if not zero3_elastic:
+                    raise ZeRORuntimeException("The checkpoint being loaded used a DP " \
+                        f"world size of {self.loaded_checkpoint_dp_world_size} but the " \
+                        f"current world size is {self.seq_dp_world_size}. Automatic adjustment " \
+                        "of ZeRO's optimizer state partitioning with a new world size is not " \
+                        "currently supported.")
             checkpoint_folder = None
             zero_sd_list = self._get_all_zero_checkpoints(load_dir, tag)
             if zero_sd_list is None:
@@ -3822,13 +3828,26 @@ def _get_all_zero_checkpoint_names(self, load_dir, tag, bf16_mode):
         return zero_ckpt_names
 
     def _get_all_zero_checkpoint_state_dicts(self, zero_ckpt_names):
+        cur_rank = dist.get_rank(group=self.optimizer.dp_process_group)
+        # Load the current rank's checkpoint first to detect whether the on-disk format is
+        # elastic.  This allows autodetection even when elastic_checkpoint=False in the config
+        # (e.g., loading a checkpoint that was saved with elastic_checkpoint=True).
+        cached = {}
+        if cur_rank < len(zero_ckpt_names) and zero_ckpt_names[cur_rank] is not None:
+            cached[cur_rank] = self.checkpoint_engine.load(zero_ckpt_names[cur_rank], map_location='cpu')
+            cur_optim_sd = cached[cur_rank].get(OPTIMIZER_STATE_DICT)
+            ckpt_is_elastic = self.zero_elastic_checkpoint() or (cur_optim_sd is not None
+                                                                 and BASE_OPTIMIZER_STATE in cur_optim_sd)
+        else:
+            ckpt_is_elastic = self.zero_elastic_checkpoint()
+
         zero_sd_list = []
         for i, ckpt_name in enumerate(zero_ckpt_names):
-            _state = None
-            if ckpt_name is None:
+            if i in cached:
+                _state = cached[i]
+            elif ckpt_name is None:
                 _state = {OPTIMIZER_STATE_DICT: None}
-            # Fully load state for current rank
-            elif self.zero_elastic_checkpoint() or dist.get_rank(group=self.optimizer.dp_process_group) == i:
+            elif ckpt_is_elastic or dist.get_rank(group=self.optimizer.dp_process_group) == i:
                 _state = self.checkpoint_engine.load(
                     ckpt_name,
                     map_location='cpu',
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 7ebb42905456..d04b404fe392 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -34,7 +34,9 @@
 from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper
 from deepspeed.runtime.swap_tensor.partitioned_optimizer_swapper import PartitionedOptimizerSwapper
 from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper
-from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FP32_FLAT_GROUPS, PARTITION_COUNT, ZERO_STAGE, LOSS_SCALER
+from deepspeed.checkpoint.constants import (OPTIMIZER_STATE_DICT, FP32_FLAT_GROUPS, PARTITION_COUNT, ZERO_STAGE,
+                                            LOSS_SCALER, BASE_OPTIMIZER_STATE, BASE_OPTIMIZER_STATE_STEP,
+                                            GROUP_PADDINGS)
 from deepspeed.accelerator import get_accelerator
 from deepspeed.runtime.zero.muon.original_muon import muon_update
 from deepspeed.runtime.zero.muon.muon_optimizer import MuonWithAuxAdam
@@ -2954,6 +2956,51 @@ def _clear_fp32_optimizer_param_groups(self):
         for param_group in self.optimizer.param_groups:
             param_group['params'] = []
 
+    def _get_elastic_optimizer_state(self):
+        # Return per-sub-group lean optimizer states (per-param tensors with padding stripped).
+        # Lean format allows the checkpoint to be loaded with a different DP world size.
+        sub_group_states = []
+        for sub_group_id, _ in enumerate(self.fp16_groups):
+            fp32_param = self.fp32_partitioned_groups_flat[sub_group_id]
+            if fp32_param not in self.optimizer.state or not self.optimizer.state[fp32_param]:
+                sub_group_states.append({})
+                continue
+            lean_state = {}
+            for key, value in self.optimizer.state[fp32_param].items():
+                if torch.is_tensor(value):
+                    lean_state[key] = self._get_lean_tensors(value, self.fp16_partitioned_groups[sub_group_id],
+                                                             self.groups_padding[sub_group_id])
+                else:
+                    lean_state[key] = value
+            sub_group_states.append(lean_state)
+        return sub_group_states
+
+    def _elastic_state_dict(self):
+        # Build a world-size-agnostic checkpoint by saving lean (padding-stripped) fp32 and
+        # optimizer-state partitions.  Each sub-group entry is a list of per-param lean tensors
+        # so that loading code can merge ranks and re-partition for any DP world size.
+        state_dict = {}
+        state_dict[ZERO_STAGE] = ZeroStageEnum.weights
+        state_dict[LOSS_SCALER] = self.loss_scaler
+        state_dict['dynamic_loss_scale'] = self.dynamic_loss_scale
+        state_dict['overflow'] = self.overflow
+        state_dict[PARTITION_COUNT] = self.partition_count
+
+        # Save padded flat tensors (same layout as rigid format) so that zero_to_fp32.py can
+        # reconstruct model weights without knowing the DP world size used at save time.
+        state_dict[FP32_FLAT_GROUPS] = list(self.fp32_partitioned_groups_flat)
+
+        self._set_fp32_optimizer_param_groups()
+        state_dict[BASE_OPTIMIZER_STATE] = self._get_elastic_optimizer_state()
+        if self.optimizer.param_groups and "step" in self.optimizer.param_groups[0]:
+            assert all(pg["step"] == self.optimizer.param_groups[0]["step"] for pg in self.optimizer.param_groups), \
+                "All param groups must have the same step value"
+            state_dict[BASE_OPTIMIZER_STATE_STEP] = self.optimizer.param_groups[0]["step"]
+        self._clear_fp32_optimizer_param_groups()
+
+        state_dict[GROUP_PADDINGS] = self.groups_padding
+        return state_dict
+
     def _rigid_state_dict(self):
         state_dict = {}
         state_dict[ZERO_STAGE] = ZeroStageEnum.weights
@@ -2981,7 +3028,7 @@ def state_dict(self):
             torch.save(checkpoint, "saved.pth")
         """
         if self.elastic_checkpoint:
-            raise NotImplementedError("ZeRO-3 does not yet support elastic checkpointing, please disable for now.")
+            return self._elastic_state_dict()
 
         return self._rigid_state_dict()
 
@@ -3055,6 +3102,121 @@ def _restore_base_optimizer_state(self, all_state_dict):
                 else:
                     self.optimizer.state[p][key] = saved
 
+    def _repartition_for_current_rank(self, lean_parts_per_ckpt_rank, param):
+        # Merge lean (padding-stripped) partitions from all checkpoint ranks into the full
+        # parameter, then slice out the current rank's portion under the new world size.
+        full_tensor = torch.cat(lean_parts_per_ckpt_rank)
+        rank = dist.get_rank(group=self.dp_process_group)
+        partition_size = param.partition_numel()
+        start = rank * partition_size
+        end = min(start + partition_size, param.ds_numel)
+        local_part = full_tensor[start:end]
+        # Last rank may need zero-padding to fill its partition slot.
+        if local_part.numel() < partition_size:
+            pad = torch.zeros(partition_size - local_part.numel(), dtype=local_part.dtype, device=local_part.device)
+            local_part = torch.cat([local_part, pad])
+        return local_part
+
+    def _restore_from_elastic_fp32_partitions(self, all_state_dict):
+        # Rebuild fp32 master weights from an elastic checkpoint that may have been saved with
+        # a different DP world size.  FP32_FLAT_GROUPS stores padded flat tensors (one per
+        # sub-group per checkpoint rank), identical in layout to the rigid format, so the same
+        # per-param partition-size arithmetic applies.
+        ckpt_world_size = len(all_state_dict)
+        for sub_group_id, fp32_flat in enumerate(self.fp32_partitioned_groups_flat):
+            params = self.fp16_groups[sub_group_id]
+            # Compute each param's partition size under the checkpoint world size.
+            ckpt_partition_sizes = [(p.ds_numel + ckpt_world_size - 1) // ckpt_world_size for p in params]
+            new_parts = []
+            for param_idx, param in enumerate(params):
+                ckpt_ps = ckpt_partition_sizes[param_idx]
+                lean_parts = []
+                for rank, sd in enumerate(all_state_dict):
+                    padded_flat = sd[FP32_FLAT_GROUPS][sub_group_id]
+                    # Split the rank's padded flat group into one chunk per param.
+                    per_param_chunks = padded_flat.split(ckpt_partition_sizes)
+                    padded_chunk = per_param_chunks[param_idx]
+                    # Strip padding: the real element count for this rank/param pair.
+                    lean_len = max(0, min(ckpt_ps, param.ds_numel - rank * ckpt_ps))
+                    lean_parts.append(padded_chunk[:lean_len])
+                new_parts.append(self._repartition_for_current_rank(lean_parts, param))
+            fp32_flat.data.copy_(torch.cat(new_parts).to(fp32_flat.dtype))
+
+    def _restore_elastic_optimizer_state(self, all_state_dict):
+        # Rebuild per-sub-group optimizer states from an elastic checkpoint.
+        step = None
+        if BASE_OPTIMIZER_STATE_STEP in all_state_dict[0]:
+            assert all(sd[BASE_OPTIMIZER_STATE_STEP] == all_state_dict[0][BASE_OPTIMIZER_STATE_STEP]
+                       for sd in all_state_dict), "Checkpoint ranks have inconsistent step values"
+            step = all_state_dict[0][BASE_OPTIMIZER_STATE_STEP]
+
+        for sub_group_id, _ in enumerate(self.fp16_groups):
+            fp32_param = self.fp32_partitioned_groups_flat[sub_group_id]
+            params = self.fp16_groups[sub_group_id]
+            all_sub_group_states = [sd[BASE_OPTIMIZER_STATE][sub_group_id] for sd in all_state_dict]
+            if not all_sub_group_states[0]:
+                continue
+            restored_state = {}
+            for key in all_sub_group_states[0].keys():
+                sample = all_sub_group_states[0][key]
+                if isinstance(sample, list) and len(sample) > 0 and torch.is_tensor(sample[0]):
+                    new_parts = [
+                        self._repartition_for_current_rank(
+                            [rank_state[key][param_idx] for rank_state in all_sub_group_states], param)
+                        for param_idx, param in enumerate(params)
+                    ]
+                    # Move to the same device and dtype as fp32_param so the fused optimizer
+                    # does not raise "tensor not on same device" during step().
+                    restored_state[key] = torch.cat(new_parts).to(device=fp32_param.device, dtype=fp32_param.dtype)
+                else:
+                    restored_state[key] = sample
+            self.optimizer.state[fp32_param] = restored_state
+
+        if step is not None:
+            for param_group in self.optimizer.param_groups:
+                param_group['step'] = step
+
+    def _elastic_load_state_dict(self, all_state_dict, load_optimizer_states=True, load_from_fp32_weights=False):
+        # Load a ZeRO-3 elastic checkpoint.  The checkpoint may have been saved with a different
+        # DP world size; elastic merge-and-repartition handles the mismatch transparently.
+        sd0 = all_state_dict[0]
+        self.loss_scaler = sd0[LOSS_SCALER]
+        self.dynamic_loss_scale = sd0['dynamic_loss_scale']
+        self.overflow = sd0['overflow']
+
+        if load_optimizer_states:
+            self._set_fp32_optimizer_param_groups()
+            self._restore_elastic_optimizer_state(all_state_dict)
+            self._clear_fp32_optimizer_param_groups()
+
+        if self.swap_optimizer:
+            self.optimizer_swapper.purge_state()
+            timer_names = set()
+            self._partition_all_parameters()
+            for sub_group_id, group in enumerate(self.fp16_groups):
+                self._prepare_sub_group(sub_group_id, timer_names)
+                self._reassign_or_swap_out_partitioned_parameters(sub_group_id)
+                self._release_sub_group(sub_group_id, timer_names)
+            self._post_step(timer_names)
+
+        if load_from_fp32_weights:
+            self._restore_from_elastic_fp32_partitions(all_state_dict)
+        else:
+            self._restore_from_bit16_weights()
+
+        # Sync fp16 partitions from the (now-updated) fp32 master weights.
+        for sub_group_id in range(len(self.fp32_partitioned_groups_flat)):
+            fp32_param = self.fp32_partitioned_groups_flat[sub_group_id]
+            if sum(fp32_param.size()) > 0:
+                fp16_param = self.fp16_partitioned_groups_flat[sub_group_id]
+                fp16_param.data.copy_(fp32_param.data)
+
+        for sub_group_id in range(len(self.fp16_partitioned_groups_flat)):
+            updated_params = self.unflatten(self.fp16_partitioned_groups_flat[sub_group_id],
+                                            self.fp16_partitioned_groups[sub_group_id])
+            for partitioned_param, q in zip(self.fp16_partitioned_groups[sub_group_id], updated_params):
+                partitioned_param.data = q.data
+
     def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True):
         # I think it should actually be ok to reload the optimizer before the model.
         self.loss_scaler = state_dict[LOSS_SCALER]
@@ -3099,7 +3261,6 @@ def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True):
             for partitioned_param, q in zip(self.fp16_partitioned_groups[sub_group_id], updated_params):
                 partitioned_param.data = q.data
 
-    # TODO: Support different/changing load/save DP degree.
     def load_state_dict(self,
                         state_dict_list,
                         load_optimizer_states=True,
@@ -3132,11 +3293,19 @@ def load_state_dict(self,
             optimizer.load_state_dict(checkpoint['optimizer'])
         """
 
-        if self.elastic_checkpoint:
-            raise NotImplementedError("ZeRO-3 does not yet support elastic checkpointing, please disable for now.")
-
         if checkpoint_folder:
             self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights)
+            return
+
+        # Detect elastic vs. rigid format by the presence of BASE_OPTIMIZER_STATE.
+        # Elastic checkpoints store lean per-param tensors and support arbitrary DP world sizes.
+        # Use the current rank's own state dict for detection; other slots may be None when the
+        # engine used lazy-loading (only fetched the current rank's file).
+        my_rank = dist.get_rank(group=self.dp_process_group)
+        own_sd = state_dict_list[my_rank] if my_rank < len(state_dict_list) else None
+        ckpt_is_elastic = own_sd is not None and BASE_OPTIMIZER_STATE in own_sd
+        if ckpt_is_elastic:
+            self._elastic_load_state_dict(state_dict_list, load_optimizer_states, load_from_fp32_weights)
         else:
             self._rigid_load_state_dict(state_dict_list[dist.get_rank(group=self.dp_process_group)],
                                         load_optimizer_states=load_optimizer_states)
@@ -3153,9 +3322,9 @@ def load_state_dict(self,
                 if local_rank != rank_end:
                     dist.send(tensor=load_serial, dst=rank + 1)
 
-            if len(self.persistent_parameters) > 0:
-                self.persistent_parameters[0].partition(self.persistent_parameters)
-                # self.persistent_parameters[0].all_gather(self.persistent_parameters) # this will be done in checkpoint_event_epilogue() so remove it to prevent double all_gather
+        if len(self.persistent_parameters) > 0:
+            self.persistent_parameters[0].partition(self.persistent_parameters)
+            # self.persistent_parameters[0].all_gather(self.persistent_parameters) # this will be done in checkpoint_event_epilogue() so remove it to prevent double all_gather
 
     def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights):
         self.load_hp_checkpoint_state_from_checkpoint_dir_stage3(checkpoint_folder)
diff --git a/docs/_tutorials/zero.md b/docs/_tutorials/zero.md
index 07e5ca9c1f5a..68daf188ae8b 100644
--- a/docs/_tutorials/zero.md
+++ b/docs/_tutorials/zero.md
@@ -246,6 +246,63 @@ registration and this step is no longer needed.
 {: .notice--info}
 
 
+## ZeRO-3 Elastic Checkpoints
+
+An *elastic checkpoint* is a checkpoint that can be saved with one data-parallel world size and loaded with a different one.
+This is useful when you need to scale training up or down, resume on fewer GPUs after a preemption, or do inference evaluation on a single machine.
+
+ZeRO Stage 1 and 2 have supported elastic checkpoints for a long time via the `elastic_checkpoint` configuration flag.
+**This PR extends the same support to ZeRO Stage 3.**
+
+### Enabling elastic checkpoints
+
+Set `elastic_checkpoint: true` in the `zero_optimization` block:
+
+```json
+{
+    "zero_optimization": {
+        "stage": 3,
+        "elastic_checkpoint": true
+    }
+}
+```
+
+No other code changes are required.
+`save_checkpoint()` and `load_checkpoint()` handle the rest automatically.
+
+### How it works
+
+**Saving.**  Instead of writing one flat optimizer-state tensor per rank (which encodes the world size in its offsets), the elastic format writes *lean* per-parameter tensors — the portion of each parameter that belongs to the saving rank, with alignment padding stripped.
+The fp32 master weights are saved in the same padded-flat layout as the rigid format so that `zero_to_fp32.py` still works unchanged.
+
+**Loading.**  On load, DeepSpeed auto-detects the format by checking for the `base_optimizer_state` key in the checkpoint files.
+If the key is present (elastic format), it:
+
+1. Merges the lean parameter shards from all checkpoint ranks into the full parameter tensor.
+2. Re-slices it for the current rank under the new world size.
+3. Restores Adam moment tensors (`exp_avg`, `exp_avg_sq`) using the same merge-and-repartition logic.
+
+This means a checkpoint saved with N GPUs loads correctly on M GPUs for any M.
+
+### Changing world size example
+
+```python
+# --- training run on 4 GPUs, elastic_checkpoint=True ---
+ds_engine.save_checkpoint(checkpoint_dir)
+
+# --- later, resume on 2 GPUs (same elastic_checkpoint=True config) ---
+ds_engine.load_checkpoint(checkpoint_dir, load_optimizer_states=True)
+# training continues normally
+```
+
+Because autodetection is based on checkpoint contents rather than the runtime config flag, you can also load an elastic checkpoint with `elastic_checkpoint: false` in your config and it will still work correctly.
+
+### Limitations
+
+- Elastic checkpoints require `stage: 3`. Stages 1 and 2 use a separate (older) implementation.
+- The `swap_optimizer` (NVMe offload) path is supported but has not been tested with world-size changes.
+- `load_from_fp32_weights: true` is compatible with elastic checkpoints and is the recommended path when resuming on different hardware for maximum precision.
+
 ## Extracting weights
 
 If you need to take the pretrained weights out of Deepspeed here is what you can do for getting fp16 weights:
diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py
index 6447d1821cd1..ff61f760fdb2 100644
--- a/tests/unit/checkpoint/test_zero_optimizer.py
+++ b/tests/unit/checkpoint/test_zero_optimizer.py
@@ -7,6 +7,7 @@
 from types import SimpleNamespace
 from deepspeed.ops.op_builder import CPUAdamBuilder
 from deepspeed.checkpoint.utils import clone_tensors_for_torch_save, get_model_ckpt_name_for_rank
+from deepspeed.checkpoint.constants import (BASE_OPTIMIZER_STATE, FP32_FLAT_GROUPS, OPTIMIZER_STATE_DICT)
 from deepspeed.accelerator import get_accelerator
 from deepspeed.runtime.zero import ZeroParamStatus
 from deepspeed.utils.torch import required_torch_version
@@ -241,6 +242,56 @@ def run(self, class_tmpdir):
         ds_model.save_checkpoint(class_tmpdir)
 
 
+# DistributedFixture that saves a ZeRO-3 elastic checkpoint from 4 GPUs so that
+# TestZeROStage3ElasticCheckpoint (world_size=2) can test cross-world-size loading.
+class ws4_zero3_elastic_checkpoint(DistributedFixture):
+    world_size = 4
+
+    def run(self, class_tmpdir):
+        config_dict = {
+            "train_batch_size": 4,
+            "optimizer": {
+                "type": "Adam"
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "elastic_checkpoint": True
+            }
+        }
+        if get_accelerator().is_bf16_supported():
+            config_dict["bf16"] = {"enabled": True}
+        elif get_accelerator().is_fp16_supported():
+            config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
+        hidden_dim = 10
+
+        with deepspeed.zero.Init(config_dict_or_path=config_dict):
+            model = SimpleModel(hidden_dim)
+        ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None)
+        # One step is enough to populate optimizer states; keeping it minimal reduces fixture time.
+        data_loader = random_dataloader(model=ds_model, total_samples=4, hidden_dim=hidden_dim, device=ds_model.device)
+        for _, batch in enumerate(data_loader):
+            loss = ds_model(batch[0], batch[1])
+            ds_model.backward(loss)
+            ds_model.step()
+        ds_model.empty_partition_cache()
+        ds_model.save_checkpoint(class_tmpdir)
+
+        # Gather full fp32 parameters on rank 0 and save as a reference for the
+        # cross-world-size numerical correctness test.
+        ref_params = {}
+        param_names = [name for name, _ in ds_model.module.named_parameters()]
+        params = list(ds_model.module.parameters())
+        dist.barrier()
+        with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
+            if dist.get_rank() == 0:
+                for name, p in zip(param_names, params):
+                    ref_params[name] = p.detach().cpu().float().clone()
+            dist.barrier()
+        if dist.get_rank() == 0:
+            torch.save(ref_params, os.path.join(class_tmpdir, "reference_params.pt"))
+        dist.barrier()
+
+
 @pytest.mark.parametrize("elastic_save", [True, False])
 @pytest.mark.parametrize("elastic_load", [True, False])
 @pytest.mark.parametrize("load_optim", [True, False])
@@ -730,33 +781,257 @@ def test_load_zeropp_model(self, ws4_model_checkpoint_zeropp, class_tmpdir):
             for v in ds_param.data.cpu().flatten().numpy():
                 assert v == 1.0
 
-    def test_load_zeropp_checkpoint(self, ws4_model_checkpoint_zeropp, class_tmpdir):
+
+class TestZeROStage3ElasticCheckpoint(DistributedTest):
+    """Unit tests for ZeRO Stage 3 elastic checkpoint save/load."""
+
+    world_size = 2
+
+    def test_elastic_checkpoint_same_world_size(self, tmpdir):
+        # Round-trip: save elastic checkpoint with world_size=2, load with world_size=2.
+        # Verifies that both model weights and optimizer states are faithfully restored.
         config_dict = {
-            "train_batch_size": 4,
+            "train_batch_size": 2,
             "optimizer": {
-                "type": 'Adam'
+                "type": "Adam"
             },
             "zero_optimization": {
                 "stage": 3,
-                "zero_hpz_partition_size": 2,
-                "stage3_param_persistence_threshold": 1
+                "elastic_checkpoint": True
+            }
+        }
+        if get_accelerator().is_bf16_supported():
+            config_dict["bf16"] = {"enabled": True}
+        elif get_accelerator().is_fp16_supported():
+            config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
+        hidden_dim = 10
+
+        with deepspeed.zero.Init(config_dict_or_path=config_dict):
+            models = [SimpleModel(hidden_dim) for _ in range(2)]
+        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=True)
+
+    def test_elastic_checkpoint_no_optimizer_states(self, tmpdir):
+        # Round-trip: save elastic checkpoint, load without optimizer states.
+        # Only model weights should be compared; optimizer state is intentionally skipped.
+        config_dict = {
+            "train_batch_size": 2,
+            "optimizer": {
+                "type": "Adam"
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "elastic_checkpoint": True
             }
         }
+        if get_accelerator().is_bf16_supported():
+            config_dict["bf16"] = {"enabled": True}
+        elif get_accelerator().is_fp16_supported():
+            config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
+        hidden_dim = 10
 
-        # Init model and load zero checkpoint
+        with deepspeed.zero.Init(config_dict_or_path=config_dict):
+            models = [SimpleModel(hidden_dim) for _ in range(2)]
+        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=False)
+
+    def test_elastic_state_dict_format(self, tmpdir):
+        # Verify that elastic_checkpoint=True produces a state dict with BASE_OPTIMIZER_STATE
+        # (list of per-param lean tensors per sub-group) and not the rigid OPTIMIZER_STATE_DICT.
+        config_dict = {
+            "train_batch_size": 2,
+            "optimizer": {
+                "type": "Adam"
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "elastic_checkpoint": True
+            }
+        }
+        if get_accelerator().is_bf16_supported():
+            config_dict["bf16"] = {"enabled": True}
+        elif get_accelerator().is_fp16_supported():
+            config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
         hidden_dim = 10
-        model = SimpleModel(hidden_dim)
+
+        with deepspeed.zero.Init(config_dict_or_path=config_dict):
+            model = SimpleModel(hidden_dim)
         ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None)
-        ds_model.load_checkpoint(class_tmpdir,
-                                 load_optimizer_states=True,
-                                 load_lr_scheduler_states=False,
-                                 load_module_only=False)
+        data_loader = random_dataloader(model=ds_model, total_samples=4, hidden_dim=hidden_dim, device=ds_model.device)
+        for _, batch in enumerate(data_loader):
+            loss = ds_model(batch[0], batch[1])
+            ds_model.backward(loss)
+            ds_model.step()
 
-        # Check the parameters after gather
-        params_to_gather = [p for p in ds_model.module.parameters() if p.ds_status == ZeroParamStatus.NOT_AVAILABLE]
-        if len(params_to_gather) > 0:
-            handle = params_to_gather[0].all_gather_coalesced(params_to_gather)
-            handle.wait()
-        for ds_param in params_to_gather:
-            for v in ds_param.data.cpu().flatten().numpy():
-                assert v == 1.0
+        sd = ds_model.optimizer.state_dict()
+        assert BASE_OPTIMIZER_STATE in sd
+        assert OPTIMIZER_STATE_DICT not in sd
+        assert FP32_FLAT_GROUPS in sd
+        # Each sub-group entry in FP32_FLAT_GROUPS should be a flat fp32 tensor (same layout as
+        # the rigid format so that zero_to_fp32.py can reconstruct model weights unchanged).
+        import torch
+        for sub_group in sd[FP32_FLAT_GROUPS]:
+            assert torch.is_tensor(sub_group)
+
+    def test_rigid_state_dict_format(self, tmpdir):
+        # Verify that elastic_checkpoint=False (default) produces a state dict with
+        # OPTIMIZER_STATE_DICT and not BASE_OPTIMIZER_STATE.
+        config_dict = {
+            "train_batch_size": 2,
+            "optimizer": {
+                "type": "Adam"
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "elastic_checkpoint": False
+            }
+        }
+        if get_accelerator().is_bf16_supported():
+            config_dict["bf16"] = {"enabled": True}
+        elif get_accelerator().is_fp16_supported():
+            config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
+        hidden_dim = 10
+
+        with deepspeed.zero.Init(config_dict_or_path=config_dict):
+            model = SimpleModel(hidden_dim)
+        ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None)
+        data_loader = random_dataloader(model=ds_model, total_samples=4, hidden_dim=hidden_dim, device=ds_model.device)
+        for _, batch in enumerate(data_loader):
+            loss = ds_model(batch[0], batch[1])
+            ds_model.backward(loss)
+            ds_model.step()
+
+        sd = ds_model.optimizer.state_dict()
+        assert OPTIMIZER_STATE_DICT in sd
+        assert BASE_OPTIMIZER_STATE not in sd
+
+    def test_elastic_format_autodetected_on_load(self, tmpdir):
+        # A checkpoint saved with elastic_checkpoint=True must load correctly even when the
+        # loading engine is configured with elastic_checkpoint=False, because load_state_dict()
+        # auto-detects the format via the BASE_OPTIMIZER_STATE key.
+        save_config = {
+            "train_batch_size": 2,
+            "optimizer": {
+                "type": "Adam"
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "elastic_checkpoint": True
+            }
+        }
+        load_config = {
+            "train_batch_size": 2,
+            "optimizer": {
+                "type": "Adam"
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "elastic_checkpoint": False
+            }
+        }
+        if get_accelerator().is_bf16_supported():
+            save_config["bf16"] = {"enabled": True}
+            load_config["bf16"] = {"enabled": True}
+        elif get_accelerator().is_fp16_supported():
+            save_config["fp16"] = {"enabled": True, "initial_scale_power": 8}
+            load_config["fp16"] = {"enabled": True, "initial_scale_power": 8}
+        hidden_dim = 10
+
+        with deepspeed.zero.Init(config_dict_or_path=save_config):
+            model_save = SimpleModel(hidden_dim)
+        with deepspeed.zero.Init(config_dict_or_path=load_config):
+            model_load = SimpleModel(hidden_dim)
+
+        ds_save = create_deepspeed_model(config_dict=save_config, model=model_save, base_optimizer=None)
+        data_loader = random_dataloader(model=ds_save, total_samples=4, hidden_dim=hidden_dim, device=ds_save.device)
+        for _, batch in enumerate(data_loader):
+            loss = ds_save(batch[0], batch[1])
+            ds_save.backward(loss)
+            ds_save.step()
+        ds_save.empty_partition_cache()
+        ds_save.save_checkpoint(tmpdir)
+
+        dist.barrier()
+
+        ds_load = create_deepspeed_model(config_dict=load_config, model=model_load, base_optimizer=None)
+        ds_load.load_checkpoint(tmpdir, load_optimizer_states=True)
+        compare_model_states(ds_save, ds_load, compare_optimizer=True)
+
+    def test_elastic_checkpoint_change_world_size(self, ws4_zero3_elastic_checkpoint, class_tmpdir):
+        # Load a ZeRO-3 elastic checkpoint saved with 4 GPUs onto a 2-GPU engine.
+        # Verifies both that loading succeeds and that the repartitioned parameters
+        # are numerically identical to the reference weights saved by the fixture.
+        config_dict = {
+            "train_batch_size": 4,
+            "optimizer": {
+                "type": "Adam"
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "elastic_checkpoint": True
+            }
+        }
+        if get_accelerator().is_bf16_supported():
+            config_dict["bf16"] = {"enabled": True}
+        elif get_accelerator().is_fp16_supported():
+            config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
+        hidden_dim = 10
+
+        with deepspeed.zero.Init(config_dict_or_path=config_dict):
+            model = SimpleModel(hidden_dim)
+        ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None)
+        ds_model.load_checkpoint(class_tmpdir, load_optimizer_states=True)
+
+        # Compare repartitioned parameters against the fp32 reference saved by the fixture.
+        # Cloning must happen inside GatheredParameters (while params are gathered on rank 0)
+        # but comparison happens outside — torch.allclose on a GPU tensor vs. CPU tensor raises
+        # RuntimeError, which would skip the inner barrier and deadlock the broadcast in __exit__.
+        ref_params = torch.load(os.path.join(class_tmpdir, "reference_params.pt"), weights_only=False)
+        # Build name list outside the context: named_parameters() inside GatheredParameters can
+        # trigger per-param ZeRO-3 all_gather hooks and cause a deadlock.
+        param_names = [name for name, _ in ds_model.module.named_parameters()]
+        params = list(ds_model.module.parameters())
+        loaded_fp32 = {}
+        dist.barrier()
+        with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
+            if dist.get_rank() == 0:
+                for name, p in zip(param_names, params):
+                    loaded_fp32[name] = p.detach().cpu().float().clone()
+            # Both ranks must reach __exit__ together for the broadcast collective.
+            dist.barrier()
+
+        mismatches = []
+        if dist.get_rank() == 0:
+            for name in param_names:
+                if not torch.allclose(loaded_fp32[name], ref_params[name], rtol=1e-3, atol=1e-3):
+                    mismatches.append(name)
+        assert not mismatches, f"Parameter(s) mismatch after cross-world-size elastic load: {mismatches}"
+
+        # Confirm training can proceed normally after cross-world-size checkpoint load.
+        data_loader = random_dataloader(model=ds_model, total_samples=4, hidden_dim=hidden_dim, device=ds_model.device)
+        for _, batch in enumerate(data_loader):
+            loss = ds_model(batch[0], batch[1])
+            ds_model.backward(loss)
+            ds_model.step()
+
+    def test_elastic_checkpoint_load_from_fp32_weights(self, tmpdir):
+        # Verify the load_from_fp32_weights=True path: _restore_from_elastic_fp32_partitions()
+        # is exercised instead of _restore_from_bit16_weights().
+        config_dict = {
+            "train_batch_size": 2,
+            "optimizer": {
+                "type": "Adam"
+            },
+            "zero_optimization": {
+                "stage": 3,
+                "elastic_checkpoint": True,
+                "load_from_fp32_weights": True,
+            }
+        }
+        if get_accelerator().is_bf16_supported():
+            config_dict["bf16"] = {"enabled": True}
+        elif get_accelerator().is_fp16_supported():
+            config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8}
+        hidden_dim = 10
+
+        with deepspeed.zero.Init(config_dict_or_path=config_dict):
+            models = [SimpleModel(hidden_dim) for _ in range(2)]
+        checkpoint_correctness_verification(config_dict, models, hidden_dim, tmpdir, load_optimizer_states=True)
diff --git a/tests/unit/common.py b/tests/unit/common.py
index 426b974eefb4..b507ef054c06 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -414,6 +414,28 @@ def test(self, distributed_fixture_example, regular_pytest_fixture, class_tmpdir
     _pytestfixturefunction = FixtureFunctionMarker(scope="function", params=None)
     __name__ = ""
 
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        # pytest >= 9 no longer registers classes with _pytestfixturefunction as fixtures.
+        # Inject a proper @pytest.fixture wrapper into the subclass's defining module so
+        # that pytest discovers it during collection while keeping backward compatibility
+        # with older pytest (which uses _pytestfixturefunction on the class itself).
+        if int(pytest.__version__.split(".")[0]) < 9:
+            return
+        import sys
+        module = sys.modules.get(cls.__module__)
+        if module is None:
+            return
+        cls_ref = cls
+
+        def _fixture_fn(request):
+            cls_ref()(request)
+
+        _fixture_fn.__name__ = cls.__name__
+        _fixture_fn.__qualname__ = cls.__qualname__
+        _fixture_fn = pytest.fixture(scope="function", name=cls.__name__)(_fixture_fn)
+        setattr(module, f"_{cls.__name__}_fixture_fn_", _fixture_fn)
+
     def __init__(self):
         assert isinstance(self.world_size, int), "Only one world size is allowed for distributed fixtures"
         self.__name__ = type(self).__name__