Skip to content

Commit 3d19f5e

Browse files
authored
NVSHMEM initialization test code (#17)
* Experiments running after build system change
* Adding init test code to run with torchrun-hpc
* Update memory allocation to allow for non-NVLink transport for inter-node NVSHMEM
* Adding unit test to check NVSHMEM init matches with NCCL
* Adding readme for NVSHMEM init script
1 parent 1d5c7ef commit 3d19f5e

4 files changed

Lines changed: 112 additions & 5 deletions

File tree

DGraph/distributed/nvshmem/NVSHMEMBackendEngine.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,13 @@ def _nvshmmem_gather(send_tensor, indices, rank_mappings):
2626
num_output_rows = indices.shape[1]
2727
num_features = send_tensor.shape[2]
2828

29-
gathered_tensor = torch.zeros((bs, num_output_rows, num_features)).to(
30-
send_tensor.device
29+
num_elem = bs * num_output_rows * num_features
30+
gathered_tensor = nvshmem.NVSHMEMP2P.allocate_symmetric_memory(
31+
num_elem, send_tensor.device.index
3132
)
33+
gathered_tensor.fill_(0).float()
34+
gathered_tensor = gathered_tensor.reshape((bs, num_output_rows, num_features))
35+
3236
# Gather the tensors
3337

3438
# TODO: Add an option to cache the max value
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import DGraph.Communicator as Comm
2+
import torch.distributed as dist
3+
import torch
4+
import DGraph.torch_nvshmem_p2p as nvshmem
5+
6+
7+
def main():
    """Smoke-test NVSHMEM initialization against the NCCL process group.

    Initializes the DGraph communicator with the NVSHMEM backend, verifies
    that its rank/world size agree with the already-initialized NCCL process
    group, then exercises symmetric-memory allocation and a `dist_get`
    all-gather-style transfer, asserting the received values.

    Assumes the launcher (e.g. torchrun-hpc) has already initialized
    `torch.distributed` with the NCCL backend before this script runs.
    """
    comm = Comm.Communicator.init_process_group("nvshmem")
    rank = comm.get_rank()
    world_size = comm.get_world_size()

    dist.barrier()
    # NVSHMEM topology must match NCCL's, or the two backends disagree about
    # which peer is which.
    assert dist.is_initialized(), "NCCL process group not initialized"
    assert dist.get_backend() == "nccl", "expected the NCCL backend"
    assert dist.get_rank() == rank, "NCCL process group rank mismatch"
    assert dist.get_world_size() == world_size, "NCCL process group world size mismatch"

    dist.barrier()
    # Serialize the check-in prints so ranks don't interleave output.
    for i in range(world_size):
        if rank == i:
            print(
                f"Rank {rank} checking in. ",
                f"Number of available GPUs: {torch.cuda.device_count()}",
            )
        dist.barrier()

    # Set device for this process (one GPU per local rank).
    local_rank = rank % torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Allocate a symmetric tensor with one slot per rank and stamp it with
    # this rank's id.  NOTE: `.fill_()` mutates in place; do NOT chain
    # `.float()` here — its result would be a new, non-symmetric tensor and
    # would be silently discarded.
    num_elements = world_size
    nvshmem_tensor = nvshmem.NVSHMEMP2P.allocate_symmetric_memory(
        num_elements, local_rank
    )
    nvshmem_tensor.fill_(rank)

    dist.barrier()
    for i in range(world_size):
        if rank == i:
            print(
                f"Rank {rank}: ",
                f"Tensor: {nvshmem_tensor}",
            )
        dist.barrier()
    assert torch.allclose(
        nvshmem_tensor, torch.full((num_elements,), rank).cuda().float()
    ), "Tensor values do not match expected values"

    # Pull element i from rank i: after dist_get, output_tensor[i] == i.
    indices = torch.arange(num_elements, dtype=torch.int64).cuda()
    output_tensor = nvshmem.NVSHMEMP2P.allocate_symmetric_memory(
        num_elements, local_rank
    )
    output_tensor.fill_(0)
    ranks = torch.arange(world_size).cuda()
    nvshmem.NVSHMEMP2P.dist_get(
        nvshmem_tensor, output_tensor, indices, ranks, 1, num_elements, 1, num_elements
    )
    dist.barrier()
    for i in range(world_size):
        if rank == i:
            print(
                f"Rank {rank}: ",
                f"Output Tensor: {output_tensor}",
            )
        dist.barrier()

    assert torch.allclose(
        output_tensor, torch.arange(world_size).cuda().float()
    ), "Output tensor values do not match expected values"


if __name__ == "__main__":
    main()

experiments/NVSHMEM-Enabled-DGRAPH/README.md

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@ Enabling the NVSHMEM backend in DGraph requires MPI and NVSHMEM to be installed
44

55
## Pre-requisites
66

7-
DGraph must be built with NVSHMEM, MPI, and CUDA in order to use the NVSHMEM backend. The
8-
setup script will install the appropriate submodules but the dependencies must be installed
9-
and available on the system.
7+
DGraph must be built with NVSHMEM, MPI, and CUDA in order to use the NVSHMEM backend. The setup script will install the appropriate submodules but the dependencies must be installed and available on the system.
108

119
DGraph searches for NVSHMEM, MPI, and CUDA based on the following environment variables:
1210
- `NVSHMEM_HOME`
@@ -40,3 +38,19 @@ NVSHMEM compilation information can be usually found by running the `nvshmem-inf
4038
$NVSHMEM_HOME/bin/nvshmem-info -b
4139
```
4240

41+
## Building DGraph with NVSHMEM
42+
To build DGraph with NVSHMEM, make sure the environment variables are set and run the following command:
43+
44+
```shell
45+
pip install -e .
46+
```
47+
48+
## Running DGraph with NVSHMEM
49+
50+
DGraph builds on top of the PyTorch distributed package, so it is important to initialize the PyTorch distributed package before using DGraph, and also to initialize MPI. Using a distributed launcher simplifies this process. We recommend using [`torchrun-hpc`](https://github.com/lbann/HPC-launcher).
51+
52+
```shell
53+
torchrun-hpc -N<NUM_NODES> -n<NUM_PROCESSES_PER_NODE> NVSHMEM_init.py
54+
```
55+
56+
The script assumes that the launcher starts the processes and that `torch.distributed` is initialized. If not using a launcher, you must initialize the PyTorch distributed package yourself.

tests/test_nvshmem_backend.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,21 @@ def test_nvshmem_backend_init(init_nvshmem_backend):
9191
print(f"Rank: {rank}")
9292

9393

94+
def test_nvshmem_backend_dist_init(init_nvshmem_backend):
    """Check that NVSHMEM backend init agrees with the NCCL process group.

    Verifies rank and world size reported by the NVSHMEM communicator match
    those of the `torch.distributed` (NCCL) process group.  If NCCL is not
    initialized there is nothing to compare against, so the test exits early.
    """
    comm = init_nvshmem_backend
    rank = comm.get_rank()
    world_size = comm.get_world_size()

    if not dist.is_initialized():
        # Nothing to compare against; bail out.  A bare `return` (not
        # `return True`) — pytest warns on non-None returns from tests.
        print("NCCL process group not initialized, skipping test...")
        return
    # The guard above already established dist.is_initialized().
    assert dist.get_rank() == rank, "NCCL process group rank mismatch"
    assert dist.get_world_size() == world_size, "NCCL process group world size mismatch"
94109
def test_nvshmem_backend_gather(init_nvshmem_backend, setup_gather_data):
95110
comm: Comm.Communicator = init_nvshmem_backend
96111
all_rank_input_data, all_edge_coo, rank_mappings, all_rank_output = (

0 commit comments

Comments
 (0)