# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import torch
import deepspeed.comm as dist
from torch.utils._sympy.functions import FloorDiv
from .sp_dp_registry import get_group, is_setup, sp_size


@torch.library.custom_op("autosp::all_to_all", mutates_args=())
def all_to_all(
    input: torch.Tensor,
    scatter_idx: int,
    gather_idx: int,
    name: str,
) -> torch.Tensor:
    """
    All-to-all collective for SDPA tensors [B, N, S, H].

    For QKV (scatter_idx=1, gather_idx=2):
        [B, N, S/P, H] -> [B, N/P, S, H]
    For O (scatter_idx=2, gather_idx=1):
        [B, N/P, S, H] -> [B, N, S/P, H]
    """
    assert is_setup(), 'SP/DP mesh has not been initialized.'
    B, dim1, dim2, H = input.shape
    # Ranks form contiguous blocks of sp_size(); pick this rank's SP group.
    gid = dist.get_rank() // sp_size()
    group = get_group(gid)

    if scatter_idx == 1:
        # QKV path: scatter heads across ranks, gather the full sequence.
        N, local_S = dim1, dim2
        # Expose the head shards as a leading dim of size sp_size() for the collective.
        input_t = input.reshape(B, sp_size(), N // sp_size(), local_S, H)
        input_t = input_t.permute(1, 0, 2, 3, 4).contiguous()

        output = torch.empty_like(input_t)
        dist.all_to_all_single(output, input_t, group=group)

        # The leading dim now indexes source ranks, i.e. sequence shards; fold it back in.
        output = output.permute(1, 2, 0, 3, 4).contiguous()
        output = output.reshape(B, N // sp_size(), sp_size() * local_S, H)
    else:
        # O path: scatter the sequence across ranks, gather all heads.
        local_N, S = dim1, dim2
        input_t = input.reshape(B, local_N, sp_size(), S // sp_size(), H)
        input_t = input_t.permute(2, 0, 1, 3, 4).contiguous()

        output = torch.empty_like(input_t)
        dist.all_to_all_single(output, input_t, group=group)

        # The leading dim now indexes source ranks, i.e. head shards; fold it back in.
        output = output.permute(1, 0, 2, 3, 4).contiguous()
        output = output.reshape(B, sp_size() * local_N, S // sp_size(), H)

    return output
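

# Shape sketch (illustrative only, not executed): with sp_size() == 4 and assumed
# example sizes B=2, N=16, S=8192, H=128:
#   QKV: all_to_all(q, 1, 2, "q"): [2, 16, 2048, 128] -> [2, 4, 8192, 128]
#   O:   all_to_all(o, 2, 1, "o"): [2, 4, 8192, 128] -> [2, 16, 2048, 128]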


@torch.library.register_fake("autosp::all_to_all")
def all_to_all_fake(input: torch.Tensor, scatter_idx: int, gather_idx: int, name: str):

    def maybe_restore_sharded_dim(dim: torch.SymInt, factor: int):
        # Torch 2.9 may keep `P * (s // P)` distinct from the original `s` during
        # fake shape propagation. When the local dim is exactly `FloorDiv(s, P)`,
        # restore the original symbol so downstream ops see a consistent sequence dim.
        node = getattr(dim, "node", None)
        if node is None:
            return dim * factor

        expr = node.expr
        if isinstance(expr, FloorDiv) and expr.args[1] == factor:
            hint = node.hint * factor if node.has_hint() else None
            return node.shape_env.create_symintnode(expr.args[0], hint=hint)

        return dim * factor

    B, dim1, dim2, H = input.shape
    if scatter_idx == 1:
        return input.new_empty(B, dim1 // sp_size(), maybe_restore_sharded_dim(dim2, sp_size()), H)
    else:
        return input.new_empty(B, dim1 * sp_size(), dim2 // sp_size(), H)
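

# Symbolic sketch (illustrative): with a dynamic sequence length `s` and P = sp_size(),
# the sharded dim arrives in the fake kernel as FloorDiv(s, P). Naively multiplying
# back yields the expression P * (s // P), which is not simplified to `s`; the helper
# above returns the original symbol `s` instead, keeping fake shapes consistent.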


def _all_to_all_backward_setup(ctx, inputs, output):
    _, scatter_idx, gather_idx, name = inputs
    # The gradient of an all-to-all is the inverse all-to-all: swap scatter and gather.
    ctx.scatter_idx = gather_idx
    ctx.gather_idx = scatter_idx
    ctx.name = name + "_grad"


def _all_to_all_backward(ctx, grad):
    return (all_to_all(grad, ctx.scatter_idx, ctx.gather_idx, ctx.name), None, None, None)


torch.library.register_autograd("autosp::all_to_all", _all_to_all_backward, setup_context=_all_to_all_backward_setup)
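

# Minimal usage sketch (illustrative only; assumes the SP/DP mesh has already been
# set up via sp_dp_registry, whose setup API is not shown in this file, and that this
# process is one of sp_size() ranks holding q_local of shape [B, N, S // sp_size(), H]):
#
#   q_heads = torch.ops.autosp.all_to_all(q_local, 1, 2, "q")  # [B, N/P, S, H]
#   q_back = torch.ops.autosp.all_to_all(q_heads, 2, 1, "q")   # [B, N, S/P, H]
#   assert q_back.shape == q_local.shape  # the two calls are inverses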