
Commit 3bdebc0

Fix/fix autotp universal checkpoint ci (#7937)
The full CI run [fails](https://github.com/deepspeedai/DeepSpeed/actions/runs/23735417401/job/69138729446) with "RuntimeError: Cannot re-initialize CUDA" in the universal checkpoint and AutoTP tests. The error occurs because those tests end up calling `torch.cuda.current_device()` under `pytest --forked`, and CUDA cannot be re-initialized in a forked subprocess once the parent process has initialized it. Since the tests only touch universal checkpoint metadata, the call is unnecessary. This PR skips constructor-time AutoTP materialization when `mp_group` is `None`; partitioning still happens in real AutoTP usage, where an actual model-parallel group is given.

Signed-off-by: Masahiro Tanaka <mtanaka@anyscale.com>
1 parent 89bf0d2
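For context, here is a minimal sketch of the failure mode (illustrative only, not code from this PR): once a parent process initializes CUDA, any CUDA call in a `fork()`ed child (which is how `pytest --forked` isolates tests) hits the re-initialization error.

```python
# Illustrative repro of the failure mode; not code from this PR.
import multiprocessing as mp

import torch


def child():
    # Raises "RuntimeError: Cannot re-initialize CUDA in forked subprocess"
    # because the parent already initialized CUDA before the fork.
    torch.cuda.current_device()


if __name__ == "__main__" and torch.cuda.is_available():
    torch.cuda.current_device()  # parent initializes CUDA
    p = mp.get_context("fork").Process(target=child)
    p.start()
    p.join()
```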

1 file changed: deepspeed/module_inject/layers.py (13 additions & 4 deletions)
@@ -369,6 +369,12 @@ def _set_param_uc_meta(self,
     def _mark_uc_metadata(self):
         return
 
+    def _should_materialize_tp_partition(self):
+        # AutoTP partitioning should only materialize parameters when an actual
+        # TP process group is present. Metadata-only construction with
+        # mp_group=None should not touch device placement.
+        return self.mp_group is not None
+
     def is_training_mode(self):
         global DEEPSPEED_AUTOTP_MODE
         return DEEPSPEED_AUTOTP_MODE == AUTOTP_MODE.TRAINING
@@ -579,7 +585,8 @@ def __init__(self, module, mp_group, **kwargs):
         self.weight = module.weight
         self.bias = module.bias
 
-        self._tp_partition([self.weight, self.bias])
+        if self._should_materialize_tp_partition():
+            self._tp_partition([self.weight, self.bias])
         self.support_training = True
         self.config_tp_params(self.weight)
         if self.bias is not None:
@@ -674,7 +681,7 @@ def __init__(self, module, mp_group=None, skip_partition=False, **kwargs):
         super(LinearLayer, self).__init__(mp_group, **kwargs)
         self.weight = module.weight
         self.bias = module.bias
-        if not skip_partition:
+        if not skip_partition and self._should_materialize_tp_partition():
             self._tp_partition([self.weight, self.bias])
         self.support_training = True
         self.config_tp_params(self.weight)
@@ -1234,7 +1241,8 @@ def __init__(self, module, mp_group, shape, partition_dim=0, **kwargs):
             raise ValueError(f"AutoTP layer '{self.name}' bias size {self.bias.numel()} does not match output shape "
                              f"{self._output_shape}.")
 
-        self._tp_partition([self.weight, self.bias])
+        if self._should_materialize_tp_partition():
+            self._tp_partition([self.weight, self.bias])
         self.support_training = True
         self.config_tp_params(self.weight)
         if self.bias is not None:
@@ -1352,7 +1360,8 @@ def __init__(self, module, mp_group, shape, partition_dim=1, **kwargs):
          self._bias_partition_dim) = _infer_subparam_logical_shapes(self._orig_weight_shape, self.shape,
                                                                     self.partition_dim, self.name)
 
-        self._tp_partition([self.weight, self.bias])
+        if self._should_materialize_tp_partition():
+            self._tp_partition([self.weight, self.bias])
         self.support_training = True
         self.config_tp_params(self.weight)
         if self.bias is not None:
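The net effect, sketched below under the assumption that the patched `LinearLayer` from `deepspeed/module_inject/layers.py` is importable as shown: constructing a layer without an `mp_group` is now metadata-only and never touches device placement, while passing a real model-parallel group still partitions as before.

```python
# Illustrative sketch; assumes the patched deepspeed/module_inject/layers.py.
import torch.nn as nn

from deepspeed.module_inject.layers import LinearLayer

base = nn.Linear(16, 16)

# Metadata-only construction: with mp_group=None,
# _should_materialize_tp_partition() returns False, _tp_partition() is
# skipped, and no CUDA call is made. This is the path the universal
# checkpoint tests exercise under pytest --forked.
meta_layer = LinearLayer(base, mp_group=None)
print(meta_layer.weight.shape)  # unpartitioned, same shape as base.weight

# Real AutoTP usage: given an actual model-parallel process group, the
# constructor partitions weight and bias exactly as before.
# tp_layer = LinearLayer(base, mp_group=real_mp_group)  # real_mp_group is hypothetical
```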

0 commit comments
