
Commit 3bdebc0

Fix/fix autotp universal checkpoint ci (#7937)
The full CI run [fails](https://github.com/deepspeedai/DeepSpeed/actions/runs/23735417401/job/69138729446) with "RuntimeError: Cannot re-initialize CUDA" in the universal checkpoint and AutoTP tests. The error occurs because those tests end up calling `torch.cuda.current_device()` under `pytest --forked`, and CUDA cannot be re-initialized in a forked subprocess once the parent process has initialized it. Since the tests only touch universal checkpoint metadata, the call is unnecessary. This PR skips constructor-time AutoTP materialization when `mp_group` is `None`; partitioning still happens in real AutoTP usage, where an actual model-parallel group is given.

Signed-off-by: Masahiro Tanaka <mtanaka@anyscale.com>
1 parent 89bf0d2
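For context, here is a minimal sketch of the failure mode (illustrative only, not code from this PR): once a parent process initializes CUDA, any CUDA call in a `fork()`ed child (which is how `pytest --forked` isolates tests) hits the re-initialization error.

```python
# Illustrative repro of the failure mode; not code from this PR.
import multiprocessing as mp

import torch


def child():
    # Raises "RuntimeError: Cannot re-initialize CUDA in forked subprocess"
    # because the parent already initialized CUDA before the fork.
    torch.cuda.current_device()


if __name__ == "__main__" and torch.cuda.is_available():
    torch.cuda.current_device()  # parent initializes CUDA
    p = mp.get_context("fork").Process(target=child)
    p.start()
    p.join()
```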

1 file changed: deepspeed/module_inject/layers.py (13 additions & 4 deletions)
@@ -369,6 +369,12 @@ def _set_param_uc_meta(self,
     def _mark_uc_metadata(self):
         return
 
+    def _should_materialize_tp_partition(self):
+        # AutoTP partitioning should only materialize parameters when an actual
+        # TP process group is present. Metadata-only construction with
+        # mp_group=None should not touch device placement.
+        return self.mp_group is not None
+
     def is_training_mode(self):
         global DEEPSPEED_AUTOTP_MODE
         return DEEPSPEED_AUTOTP_MODE == AUTOTP_MODE.TRAINING
@@ -579,7 +585,8 @@ def __init__(self, module, mp_group, **kwargs):
         self.weight = module.weight
         self.bias = module.bias
 
-        self._tp_partition([self.weight, self.bias])
+        if self._should_materialize_tp_partition():
+            self._tp_partition([self.weight, self.bias])
         self.support_training = True
         self.config_tp_params(self.weight)
         if self.bias is not None:
@@ -674,7 +681,7 @@ def __init__(self, module, mp_group=None, skip_partition=False, **kwargs):
         super(LinearLayer, self).__init__(mp_group, **kwargs)
         self.weight = module.weight
         self.bias = module.bias
-        if not skip_partition:
+        if not skip_partition and self._should_materialize_tp_partition():
             self._tp_partition([self.weight, self.bias])
         self.support_training = True
         self.config_tp_params(self.weight)
@@ -1234,7 +1241,8 @@ def __init__(self, module, mp_group, shape, partition_dim=0, **kwargs):
             raise ValueError(f"AutoTP layer '{self.name}' bias size {self.bias.numel()} does not match output shape "
                              f"{self._output_shape}.")
 
-        self._tp_partition([self.weight, self.bias])
+        if self._should_materialize_tp_partition():
+            self._tp_partition([self.weight, self.bias])
         self.support_training = True
         self.config_tp_params(self.weight)
         if self.bias is not None:
@@ -1352,7 +1360,8 @@ def __init__(self, module, mp_group, shape, partition_dim=1, **kwargs):
          self._bias_partition_dim) = _infer_subparam_logical_shapes(self._orig_weight_shape, self.shape,
                                                                     self.partition_dim, self.name)
 
-        self._tp_partition([self.weight, self.bias])
+        if self._should_materialize_tp_partition():
+            self._tp_partition([self.weight, self.bias])
         self.support_training = True
         self.config_tp_params(self.weight)
         if self.bias is not None:
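The net effect, sketched below under the assumption that the patched `LinearLayer` from `deepspeed/module_inject/layers.py` is importable as shown: constructing a layer without an `mp_group` is now metadata-only and never touches device placement, while passing a real model-parallel group still partitions as before.

```python
# Illustrative sketch; assumes the patched deepspeed/module_inject/layers.py.
import torch.nn as nn

from deepspeed.module_inject.layers import LinearLayer

base = nn.Linear(16, 16)

# Metadata-only construction: with mp_group=None,
# _should_materialize_tp_partition() returns False, _tp_partition() is
# skipped, and no CUDA call is made. This is the path the universal
# checkpoint tests exercise under pytest --forked.
meta_layer = LinearLayer(base, mp_group=None)
print(meta_layer.weight.shape)  # unpartitioned, same shape as base.weight

# Real AutoTP usage: given an actual model-parallel process group, the
# constructor partitions weight and bias exactly as before.
# tp_layer = LinearLayer(base, mp_group=real_mp_group)  # real_mp_group is hypothetical
```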

0 commit comments
