From 14fd212fa6ed76fa541f2e4d405675d6dc61e353 Mon Sep 17 00:00:00 2001
From: Sai Vishal Pothula <spothula@nvidia.com>
Date: Fri, 27 Feb 2026 04:40:02 +0200
Subject: [PATCH 01/16] Initial implementation of symmetric memory backend for
 PyTorch

---
 csrc/multidevice/communicator.cpp             |  16 ++
 csrc/multidevice/communicator.h               |  13 ++
 csrc/multidevice/ipc_utils.cpp                |  18 ++
 csrc/multidevice/ipc_utils.h                  |  13 ++
 csrc/multidevice/symmetric_tensor.cpp         | 163 +++++++++++++++++-
 csrc/multidevice/symmetric_tensor.h           |  21 ++-
 csrc/options.cpp                              |   1 +
 csrc/options.h                                |   2 +
 fbuild.sh                                     |  24 +++
 .../cpp/test_multidevice_symmetric_tensor.cpp | 108 ++++++++++++
 10 files changed, 372 insertions(+), 7 deletions(-)
 create mode 100755 fbuild.sh
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index dbd65ba4610..7531f7ff186 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -424,4 +424,20 @@ void Communicator::barrier(std::optional<CommunicatorBackend> backend) {
   getWorld(backend)->barrier(options)->wait();
 }
 
+#ifdef NVFUSER_DISTRIBUTED
+c10::intrusive_ptr<c10d::Store> Communicator::getStore() const {
+  return c10::intrusive_ptr<c10d::Store>(store_);
+}
+
+c10::intrusive_ptr<c10d::Backend> Communicator::getWorldBackendIntrusivePtr(
+    std::optional<CommunicatorBackend> backend) {
+  std::vector<RankType> all_ranks(size_);
+  std::iota(all_ranks.begin(), all_ranks.end(), 0);
+  CommunicatorBackend b = backend.value_or(default_backend_);
+  std::string team_key = getTeamKey(all_ranks, b);
+  (void)getBackendForTeam(all_ranks, backend, "");
+  return backends_.at(team_key);
+}
+#endif
+
 } // namespace nvfuser
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index b56e6fee3aa..45a7f9e4f4f 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -13,6 +13,7 @@
 
 #ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/Backend.hpp>
+#include <torch/csrc/distributed/c10d/Store.hpp>
 #include <torch/csrc/distributed/c10d/TCPStore.hpp>
 #include <torch/csrc/distributed/c10d/Work.hpp>
 #else
@@ -124,6 +125,18 @@ class NVF_API Communicator {
     return store_.get();
   }
 
+#ifdef NVFUSER_DISTRIBUTED
+  // Returns the store as an intrusive_ptr for use with PyTorch symmetric
+  // memory (c10d::symmetric_memory::set_group_info).
+  c10::intrusive_ptr<c10d::Store> getStore() const;
+
+  // Returns the world backend as an intrusive_ptr so it can be registered with
+  // c10d::register_process_group (e.g. for PyTorch symmetric memory NCCL
+  // rendezvous, which resolves the group by name).
+  c10::intrusive_ptr<c10d::Backend> getWorldBackendIntrusivePtr(
+      std::optional<CommunicatorBackend> backend = std::nullopt);
+#endif
+
  private:
   Communicator(
       CommunicatorBackend backend = comm_backend_default,
diff --git a/csrc/multidevice/ipc_utils.cpp b/csrc/multidevice/ipc_utils.cpp
index 656b4ee5e24..01bdf949044 100644
--- a/csrc/multidevice/ipc_utils.cpp
+++ b/csrc/multidevice/ipc_utils.cpp
@@ -191,4 +191,22 @@ MulticastProtocol getMulticastProtocol() {
   return MulticastProtocol::BatchMemcpy;
 }
 
+SymmetricMemoryBackend getSymmetricMemoryBackend() {
+  if (isOptionEnabled(EnableOption::SymmetricMemoryBackend)) {
+    if (hasEnableOptionArgument(
+            EnableOption::SymmetricMemoryBackend, "pytorch_nccl")) {
+      return SymmetricMemoryBackend::PyTorchNccl;
+    }
+    if (hasEnableOptionArgument(
+            EnableOption::SymmetricMemoryBackend, "pytorch_nvshmem")) {
+      return SymmetricMemoryBackend::PyTorchNvshmem;
+    }
+    if (hasEnableOptionArgument(
+            EnableOption::SymmetricMemoryBackend, "pytorch_cuda")) {
+      return SymmetricMemoryBackend::PyTorchCuda;
+    }
+  }
+  return SymmetricMemoryBackend::Native;
+}
+
 } // namespace nvfuser
diff --git a/csrc/multidevice/ipc_utils.h b/csrc/multidevice/ipc_utils.h
index bac466d74f8..0cfd6586e47 100644
--- a/csrc/multidevice/ipc_utils.h
+++ b/csrc/multidevice/ipc_utils.h
@@ -33,6 +33,19 @@ enum class MulticastProtocol { Memcpy, Multimem, BatchMemcpy };
 
 MulticastProtocol getMulticastProtocol();
 
+// Backend for symmetric memory allocation and rendezvous.
+// Native: Fuser's own CUDA VMM + IPC implementation (default, maintained).
+// PyTorch*: Use PyTorch's symmetric memory (torch.distributed._symmetric_memory)
+// with the given transport backend (Nccl, Nvshmem, or Cuda).
+enum class SymmetricMemoryBackend {
+  Native,
+  PyTorchNccl,
+  PyTorchNvshmem,
+  PyTorchCuda,
+};
+
+SymmetricMemoryBackend getSymmetricMemoryBackend();
+
 // Creates a listening Unix domain socket bound to path.
 // If path starts with '@', it uses the abstract namespace (replaced with \0).
 // Returns the socket file descriptor.
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index 902ec5a3a32..6f8c3330515 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -7,6 +7,7 @@
 // clang-format on
 #include "multidevice/symmetric_tensor.h"
 
+#include <mutex>
 #include <numeric>
 
 #include "cuda_utils.h"
@@ -15,10 +16,63 @@
 #include "multidevice/ipc_utils.h"
 #include "multidevice/utils.h"
 
+#ifdef NVFUSER_DISTRIBUTED
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
+#endif
+
 namespace nvfuser {
 
 namespace {
 
+#ifdef NVFUSER_DISTRIBUTED
+const char* kPyTorchSymmMemGroupName = "nvfuser_symm";
+
+// Cache: tensor storage data ptr -> PyTorch SymmetricMemory handle from
+// rendezvous. Used so SymmetricTensor(tensor) can recover the handle.
+std::unordered_map<void*, c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory>>&
+getPySymmHandleCache() {
+  static std::unordered_map<
+      void*,
+      c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory>>
+      cache;
+  return cache;
+}
+
+std::mutex& getPySymmHandleCacheMutex() {
+  static std::mutex m;
+  return m;
+}
+
+void ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
+  static std::once_flag once;
+  std::call_once(once, [backend]() {
+    const char* name = nullptr;
+    switch (backend) {
+      case SymmetricMemoryBackend::PyTorchNccl:
+        name = "NCCL";
+        break;
+      case SymmetricMemoryBackend::PyTorchNvshmem:
+        name = "NVSHMEM";
+        break;
+      case SymmetricMemoryBackend::PyTorchCuda:
+        name = "CUDA";
+        break;
+      default:
+        NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend");
+    }
+    c10d::symmetric_memory::set_backend(name);
+    Communicator& comm = Communicator::getInstance();
+    NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory");
+    c10d::symmetric_memory::set_group_info(
+        kPyTorchSymmMemGroupName,
+        static_cast<int>(comm.deviceId()),
+        static_cast<int>(comm.size()),
+        comm.getStore());
+  });
+}
+#endif
+
+
 // Returns the allocation granularity for symmetric memory.
 // - query_mcast_granularity: if true, considers multicast granularity
 // - query_mcast_recommended_granularity: if true, uses recommended (larger)
@@ -88,6 +142,48 @@ at::Tensor SymmetricTensor::allocate(
     at::IntArrayRef sizes,
     at::ScalarType dtype,
     at::Device device) {
+  SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
+
+#ifdef NVFUSER_DISTRIBUTED
+  if (backend != SymmetricMemoryBackend::Native) {
+    ensurePyTorchSymmMemBackend(backend);
+    std::vector<int64_t> strides(sizes.size());
+    strides.back() = 1;
+    for (int64_t i = (int64_t)strides.size() - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * sizes[i + 1];
+    }
+    // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name.
+    c10::optional<std::string> alloc_group_name =
+        (backend == SymmetricMemoryBackend::PyTorchNccl)
+        ? c10::nullopt
+        : c10::optional<std::string>(kPyTorchSymmMemGroupName);
+    at::Tensor tensor = c10d::symmetric_memory::empty_strided_p2p(
+        sizes,
+        strides,
+        dtype,
+        device,
+        alloc_group_name,
+        c10::nullopt);
+    c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> handle =
+        c10d::symmetric_memory::rendezvous(
+            tensor, c10::optional<std::string>(kPyTorchSymmMemGroupName));
+    void* key = tensor.storage().data_ptr().get();
+    {
+      std::lock_guard<std::mutex> lock(getPySymmHandleCacheMutex());
+      getPySymmHandleCache()[key] = handle;
+    }
+    return tensor;
+  }
+#else
+  if (backend != SymmetricMemoryBackend::Native) {
+    NVF_ERROR(
+        false,
+        "PyTorch symmetric memory backend requires a build with "
+        "NVFUSER_DISTRIBUTED. Use NVFUSER_ENABLE=symmetric_memory_backend(native) "
+        "or do not set symmetric_memory_backend.");
+  }
+#endif
+
   int is_vmm_supported;
   NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute(
       &is_vmm_supported,
@@ -212,6 +308,28 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
       "Expected CUDA tensor, got: ",
       local_tensor.device());
 
+#ifdef NVFUSER_DISTRIBUTED
+  {
+    void* key = local_tensor.storage().data_ptr().get();
+    std::lock_guard<std::mutex> lock(getPySymmHandleCacheMutex());
+    auto& cache = getPySymmHandleCache();
+    auto it = cache.find(key);
+    if (it != cache.end()) {
+      py_symm_handle_ = std::move(it->second);
+      cache.erase(it);
+      world_size_ = py_symm_handle_->get_world_size();
+      my_device_id_ = py_symm_handle_->get_rank();
+      requested_size_ = local_tensor.numel() * local_tensor.element_size();
+      are_remote_tensors_setup_ = true; // PyTorch rendezvous already set up
+      if (py_symm_handle_->has_multicast_support()) {
+        is_multicast_setup_ = true;
+        mc_ptr_ = py_symm_handle_->get_multicast_ptr();
+      }
+      return;
+    }
+  }
+#endif
+
   std::string error = SymmetricTensor::validate(local_tensor);
   NVF_CHECK(error.empty(), "Invalid symmetric allocation: ", error);
 
@@ -253,6 +371,11 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
 }
 
 SymmetricTensor::~SymmetricTensor() {
+#ifdef NVFUSER_DISTRIBUTED
+  if (py_symm_handle_) {
+    return; // PyTorch backend: no native VMM cleanup
+  }
+#endif
 #if (CUDA_VERSION >= 13000)
   if (is_multicast_setup_) {
     if (mc_base_ptr_) {
@@ -302,6 +425,11 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
   if (are_remote_tensors_setup_ == true) {
     return;
   }
+#ifdef NVFUSER_DISTRIBUTED
+  if (py_symm_handle_) {
+    return; // PyTorch backend: rendezvous already established remote access
+  }
+#endif
   Communicator& comm = Communicator::getInstance();
   CUmemGenericAllocationHandle local_handle = alloc_handles_[my_device_id_];
   CUdeviceptr local_ptr = remote_ptrs_[my_device_id_];
@@ -379,6 +507,13 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
     return local_tensor_;
   }
 
+#ifdef NVFUSER_DISTRIBUTED
+  if (py_symm_handle_) {
+    return py_symm_handle_->get_remote_tensor(
+        rank, local_tensor_.sizes(), local_tensor_.scalar_type());
+  }
+#endif
+
   NVF_CHECK(are_remote_tensors_setup_ == true, "Remote tensors not setup");
   return at::from_blob(
       reinterpret_cast<void*>(remote_ptrs_[rank]),
@@ -390,6 +525,13 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
 }
 
 void* SymmetricTensor::multicastPtr() const {
+#ifdef NVFUSER_DISTRIBUTED
+  if (py_symm_handle_) {
+    return py_symm_handle_->has_multicast_support()
+        ? py_symm_handle_->get_multicast_ptr()
+        : nullptr;
+  }
+#endif
   NVF_CHECK(is_multicast_setup_, "Multicast not setup");
   return mc_ptr_;
 }
@@ -398,7 +540,14 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
   if (is_contiguous_view_setup_) {
     return;
   }
-
+#ifdef NVFUSER_DISTRIBUTED
+  if (py_symm_handle_) {
+    NVF_ERROR(
+        false,
+        "Contiguous view is not yet supported for PyTorch symmetric memory backend. "
+        "Use native backend for SymmetricContiguousView.");
+  }
+#endif
   NVF_CHECK(
       are_remote_tensors_setup_ == true,
       "Remote tensors must be setup before setupContiguousView");
@@ -462,6 +611,13 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
 }
 
 at::Tensor SymmetricTensor::getContiguousView() const {
+#ifdef NVFUSER_DISTRIBUTED
+  if (py_symm_handle_) {
+    NVF_ERROR(
+        false,
+        "Contiguous view is not yet supported for PyTorch symmetric memory backend.");
+  }
+#endif
   NVF_CHECK(is_contiguous_view_setup_, "Contiguous view not setup");
   return contiguous_view_;
 }
@@ -469,6 +625,11 @@ at::Tensor SymmetricTensor::getContiguousView() const {
 void SymmetricTensor::setupMulticast(
     int64_t exporter_rank,
     const std::string& tag) {
+#ifdef NVFUSER_DISTRIBUTED
+  if (py_symm_handle_) {
+    return; // PyTorch backend: multicast handled by backend if supported
+  }
+#endif
 #if (CUDA_VERSION >= 13000)
   if (is_multicast_setup_) {
     return;
diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h
index 5608153e0ce..c928a7d5469 100644
--- a/csrc/multidevice/symmetric_tensor.h
+++ b/csrc/multidevice/symmetric_tensor.h
@@ -10,6 +10,10 @@
 #include <ATen/core/Tensor.h>
 #include <cuda.h>
 
+#ifdef NVFUSER_DISTRIBUTED
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
+#endif
+
 namespace nvfuser {
 
 // SymmetricTensor wraps a local symmetric memory allocation and enables:
@@ -18,13 +22,14 @@ namespace nvfuser {
 // - Contiguous view creation across all ranks
 //
 // Design: Decouples local allocation from IPC handle exchange for better
-// interoperability and support for pre-allocated user buffers
+// interoperability and support for pre-allocated user buffers.
 //
-// TODO: Long term plan is to integrate pytorch's native symmetric memory as a
-// possible backend. One important reason to use pytorch's allocator is to use
-// pytorch's memory pool to let the framework own the memory stack and not
-// further fragment the memory. On the other hand, having our own implementation
-// allows us to experiment more advanced features like contigous view creation.
+// Backends (see SymmetricMemoryBackend in ipc_utils.h):
+// - Native (default): Fuser's own CUDA VMM + IPC implementation; maintained.
+// - PyTorch (Nccl, Nvshmem, Cuda): Use PyTorch's symmetric memory
+//   (torch.distributed._symmetric_memory) with the chosen transport backend.
+//   Select via NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl|pytorch_nvshmem|pytorch_cuda).
+//   Native remains the default when the option is not set.
 class SymmetricTensor {
  public:
   // Wrap pre-allocated symmetric tensor (must use allocate())
@@ -79,6 +84,10 @@ class SymmetricTensor {
   int peer_fd_{-1};
   bool is_contiguous_view_setup_ = false;
   at::Tensor contiguous_view_;
+#ifdef NVFUSER_DISTRIBUTED
+  // When set, remote/multicast APIs delegate to PyTorch symmetric memory.
+  c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> py_symm_handle_;
+#endif
 };
 
 } // namespace nvfuser
diff --git a/csrc/options.cpp b/csrc/options.cpp
index 6d587e35afd..9197b43ee3e 100644
--- a/csrc/options.cpp
+++ b/csrc/options.cpp
@@ -183,6 +183,7 @@ const std::unordered_map<std::string, EnableOption>& getEnableOptions() {
           {"fast_math", EnableOption::FastMath},
           {"p2p_protocol", EnableOption::P2pProtocol},
           {"multicast_protocol", EnableOption::MulticastProtocol},
+          {"symmetric_memory_backend", EnableOption::SymmetricMemoryBackend},
           {"parallel_serde", EnableOption::ParallelSerde},
       };
   return available_options;
diff --git a/csrc/options.h b/csrc/options.h
index 4c72c757460..bda66b1f526 100644
--- a/csrc/options.h
+++ b/csrc/options.h
@@ -128,6 +128,8 @@ enum class EnableOption {
   P2pProtocol, //! Prescribe P2P protocol: put|get
   MulticastProtocol, //! Prescribe multicast protocol:
                      //! memcpy|multimem|batch_memcpy
+  SymmetricMemoryBackend, //! Prescribe symmetric memory backend:
+                          //! native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda
   ParallelSerde, //! Enable deserializing FusionExecutorCache in parallel
   EndOfOption //! Placeholder for counting the number of elements
 };
diff --git a/fbuild.sh b/fbuild.sh
new file mode 100755
index 00000000000..e16a2e2cdd9
--- /dev/null
+++ b/fbuild.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+export CC=clang-20
+export CXX=clang++-20
+export LDFLAGS="-fuse-ld=mold"
+
+export NVFUSER_BUILD_ENABLE_PCH
+
+export UCC_HOME="/opt/hpcx/ucc"
+export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc"
+export UCX_HOME="/opt/hpcx/ucx"
+export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx"
+
+# export TORCH_CUDA_ARCH_LIST="9.0"
+
+export NVFUSER_BUILD_WITH_UCC=1
+export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser
+export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY
+
+# Enable debug mode, leave empty for non-debug compilation
+export NVFUSER_BUILD_BUILD_TYPE=Debug
+export RUN_CMAKE=""
+
+pip install -v -e ./python --no-build-isolation
diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp
index 2e4b5e66767..98af2bd6fbd 100644
--- a/tests/cpp/test_multidevice_symmetric_tensor.cpp
+++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp
@@ -5,6 +5,7 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
+#include "multidevice/ipc_utils.h"
 #include "multidevice/symmetric_tensor.h"
 #include "tests/cpp/multidevice.h"
 
@@ -12,6 +13,68 @@ namespace nvfuser {
 
 using SymmetricTensorTest = MultiDeviceTest;
 
+// -----------------------------------------------------------------------------
+// Symmetric memory backend and option tests
+// -----------------------------------------------------------------------------
+
+TEST_F(SymmetricTensorTest, GetSymmetricMemoryBackend_ReturnsValidBackend) {
+  SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
+  EXPECT_TRUE(
+      backend == SymmetricMemoryBackend::Native ||
+      backend == SymmetricMemoryBackend::PyTorchNccl ||
+      backend == SymmetricMemoryBackend::PyTorchNvshmem ||
+      backend == SymmetricMemoryBackend::PyTorchCuda)
+      << "getSymmetricMemoryBackend() returned an invalid backend value";
+}
+
+TEST_F(SymmetricTensorTest, Validate_RejectsNormalCudaTensor) {
+  if (communicator_->size() == 1) {
+    GTEST_SKIP() << "Skipping test for single device (Native allocate needs VMM)";
+  }
+  // With Native backend, allocate() uses VMM; with PyTorch backend we skip
+  // because validate() is only used for Native path (PyTorch tensors come from
+  // cache in constructor).
+  if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
+    GTEST_SKIP() << "Validate test applies to Native backend only";
+  }
+  // Allocate a normal (non-symmetric) CUDA tensor
+  at::Tensor normal_tensor = at::empty(
+      {64, 64},
+      at::TensorOptions().dtype(at::kFloat).device(communicator_->device()));
+  std::string error = SymmetricTensor::validate(normal_tensor);
+  EXPECT_FALSE(error.empty())
+      << "SymmetricTensor::validate() should reject a normal CUDA tensor";
+}
+
+TEST_F(SymmetricTensorTest, Validate_AcceptsSymmetricAllocation) {
+  if (communicator_->size() == 1) {
+    GTEST_SKIP() << "Skipping test for single device";
+  }
+  if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
+    GTEST_SKIP() << "validate() for allocate() output is defined for Native backend only";
+  }
+  at::Tensor sym_tensor = SymmetricTensor::allocate(
+      {128, 128}, at::ScalarType::Float, communicator_->device());
+  std::string error = SymmetricTensor::validate(sym_tensor);
+  EXPECT_TRUE(error.empty())
+      << "SymmetricTensor::validate() should accept tensor from allocate(); got: "
+      << error;
+}
+
+TEST_F(SymmetricTensorTest, Constructor_ThrowsOnInvalidTensor) {
+  at::Tensor normal_tensor = at::empty(
+      {8, 8},
+      at::TensorOptions().dtype(at::kFloat).device(communicator_->device()));
+  EXPECT_THROW(
+      { SymmetricTensor sym_tensor(normal_tensor); },
+      c10::Error)
+      << "SymmetricTensor constructor should throw when given a non-symmetric tensor";
+}
+
+// -----------------------------------------------------------------------------
+// Backend-agnostic and Native backend correctness (allocate + remote access)
+// -----------------------------------------------------------------------------
+
 TEST_F(SymmetricTensorTest, BasicAllocation) {
   if (communicator_->size() == 1) {
     GTEST_SKIP() << "Skipping test for single device";
@@ -54,6 +117,51 @@ TEST_F(SymmetricTensorTest, BasicAllocation) {
   }
 }
 
+// Same remote-access correctness as BasicAllocation but only runs when
+// PyTorch symmetric memory backend is selected (NVFUSER_ENABLE=
+// symmetric_memory_backend(pytorch_nccl|pytorch_nvshmem|pytorch_cuda)).
+// Run with e.g. NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl) to
+// exercise the PyTorch path.
+TEST_F(SymmetricTensorTest, PyTorchBackend_RemoteAccessCorrectness) {
+  if (communicator_->size() == 1) {
+    GTEST_SKIP() << "Skipping test for single device";
+  }
+  SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
+  if (backend == SymmetricMemoryBackend::Native) {
+    GTEST_SKIP()
+        << "PyTorch backend not selected; set NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl) to run";
+  }
+
+  const int64_t rank = communicator_->deviceId();
+  const int64_t world_size = communicator_->size();
+
+  at::Tensor local_tensor = SymmetricTensor::allocate(
+      {256, 512}, at::ScalarType::Float, communicator_->device());
+  SymmetricTensor sym_tensor(local_tensor);
+
+  EXPECT_TRUE(local_tensor.is_cuda());
+  EXPECT_EQ(local_tensor.numel(), 256 * 512);
+
+  float local_value = static_cast<float>(rank + 200);
+  local_tensor.fill_(local_value);
+
+  sym_tensor.setupRemoteHandles();
+
+  for (int64_t peer_rank = 0; peer_rank < world_size; ++peer_rank) {
+    void* peer_ptr = sym_tensor.remoteTensor(peer_rank).data_ptr();
+    EXPECT_NE(peer_ptr, nullptr);
+
+    float peer_value;
+    NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
+        &peer_value, peer_ptr, sizeof(float), cudaMemcpyDeviceToHost));
+
+    float expected_value = static_cast<float>(peer_rank + 200);
+    EXPECT_FLOAT_EQ(peer_value, expected_value)
+        << "Rank " << rank << " reading from rank " << peer_rank
+        << " (PyTorch backend)";
+  }
+}
+
 TEST_F(SymmetricTensorTest, PreallocatedTensor) {
   if (communicator_->size() == 1) {
     GTEST_SKIP() << "Skipping test for single device";

From 49d669c3b237a1b78949371f3f9133c6bc9167ec Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Mon, 9 Mar 2026 11:19:58 +0200
Subject: [PATCH 02/16] Initial review comments

---
 csrc/multidevice/communicator.cpp             |  1 +
 csrc/multidevice/symmetric_tensor.cpp         | 71 +++++++------------
 .../cpp/test_multidevice_symmetric_tensor.cpp | 48 -------------
 3 files changed, 25 insertions(+), 95 deletions(-)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 7531f7ff186..85fc15ae3a5 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -441,3 +441,4 @@ c10::intrusive_ptr<c10d::Backend> Communicator::getWorldBackendIntrusivePtr(
 #endif
 
 } // namespace nvfuser
+
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index 6f8c3330515..b3e1ca49c8e 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -7,7 +7,6 @@
 // clang-format on
 #include "multidevice/symmetric_tensor.h"
 
-#include <mutex>
 #include <numeric>
 
 #include "cuda_utils.h"
@@ -18,6 +17,7 @@
 
 #ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
+#include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
 #endif
 
 namespace nvfuser {
@@ -27,22 +27,6 @@ namespace {
 #ifdef NVFUSER_DISTRIBUTED
 const char* kPyTorchSymmMemGroupName = "nvfuser_symm";
 
-// Cache: tensor storage data ptr -> PyTorch SymmetricMemory handle from
-// rendezvous. Used so SymmetricTensor(tensor) can recover the handle.
-std::unordered_map<void*, c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory>>&
-getPySymmHandleCache() {
-  static std::unordered_map<
-      void*,
-      c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory>>
-      cache;
-  return cache;
-}
-
-std::mutex& getPySymmHandleCacheMutex() {
-  static std::mutex m;
-  return m;
-}
-
 void ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
   static std::once_flag once;
   std::call_once(once, [backend]() {
@@ -68,6 +52,9 @@ void ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
         static_cast<int>(comm.deviceId()),
         static_cast<int>(comm.size()),
         comm.getStore());
+    // c10d::register_process_group(
+    //   kPyTorchSymmMemGroupName,
+    //   comm.getWorldBackendIntrusivePtr(CommunicatorBackend::kNccl));
   });
 }
 #endif
@@ -157,22 +144,13 @@ at::Tensor SymmetricTensor::allocate(
         (backend == SymmetricMemoryBackend::PyTorchNccl)
         ? c10::nullopt
         : c10::optional<std::string>(kPyTorchSymmMemGroupName);
-    at::Tensor tensor = c10d::symmetric_memory::empty_strided_p2p(
+    return c10d::symmetric_memory::empty_strided_p2p(
         sizes,
         strides,
         dtype,
         device,
         alloc_group_name,
         c10::nullopt);
-    c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> handle =
-        c10d::symmetric_memory::rendezvous(
-            tensor, c10::optional<std::string>(kPyTorchSymmMemGroupName));
-    void* key = tensor.storage().data_ptr().get();
-    {
-      std::lock_guard<std::mutex> lock(getPySymmHandleCacheMutex());
-      getPySymmHandleCache()[key] = handle;
-    }
-    return tensor;
   }
 #else
   if (backend != SymmetricMemoryBackend::Native) {
@@ -309,24 +287,14 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
       local_tensor.device());
 
 #ifdef NVFUSER_DISTRIBUTED
-  {
-    void* key = local_tensor.storage().data_ptr().get();
-    std::lock_guard<std::mutex> lock(getPySymmHandleCacheMutex());
-    auto& cache = getPySymmHandleCache();
-    auto it = cache.find(key);
-    if (it != cache.end()) {
-      py_symm_handle_ = std::move(it->second);
-      cache.erase(it);
-      world_size_ = py_symm_handle_->get_world_size();
-      my_device_id_ = py_symm_handle_->get_rank();
-      requested_size_ = local_tensor.numel() * local_tensor.element_size();
-      are_remote_tensors_setup_ = true; // PyTorch rendezvous already set up
-      if (py_symm_handle_->has_multicast_support()) {
-        is_multicast_setup_ = true;
-        mc_ptr_ = py_symm_handle_->get_multicast_ptr();
-      }
-      return;
-    }
+  SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
+  if (backend != SymmetricMemoryBackend::Native) {
+    ensurePyTorchSymmMemBackend(backend);
+    Communicator& comm = Communicator::getInstance();
+    world_size_ = comm.size();
+    my_device_id_ = comm.deviceId();
+    requested_size_ = local_tensor.numel() * local_tensor.element_size();
+    return; // Rendezvous runs in setupRemoteHandles()
   }
 #endif
 
@@ -426,8 +394,17 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
     return;
   }
 #ifdef NVFUSER_DISTRIBUTED
-  if (py_symm_handle_) {
-    return; // PyTorch backend: rendezvous already established remote access
+  // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles).
+  if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
+    ensurePyTorchSymmMemBackend(getSymmetricMemoryBackend());
+    py_symm_handle_ = c10d::symmetric_memory::rendezvous(
+        local_tensor_, c10::optional<std::string>(kPyTorchSymmMemGroupName));
+    are_remote_tensors_setup_ = true;
+    if (py_symm_handle_->has_multicast_support()) {
+      is_multicast_setup_ = true;
+      mc_ptr_ = py_symm_handle_->get_multicast_ptr();
+    }
+    return;
   }
 #endif
   Communicator& comm = Communicator::getInstance();
diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp
index afb895ca938..19e213e6ab9 100644
--- a/tests/cpp/test_multidevice_symmetric_tensor.cpp
+++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp
@@ -27,54 +27,6 @@ TEST_F(SymmetricTensorTest, GetSymmetricMemoryBackend_ReturnsValidBackend) {
       << "getSymmetricMemoryBackend() returned an invalid backend value";
 }
 
-TEST_F(SymmetricTensorTest, Validate_RejectsNormalCudaTensor) {
-  if (communicator_->size() == 1) {
-    GTEST_SKIP() << "Skipping test for single device (Native allocate needs VMM)";
-  }
-  // With Native backend, allocate() uses VMM; with PyTorch backend we skip
-  // because validate() is only used for Native path (PyTorch tensors come from
-  // cache in constructor).
-  if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
-    GTEST_SKIP() << "Validate test applies to Native backend only";
-  }
-  // Allocate a normal (non-symmetric) CUDA tensor
-  at::Tensor normal_tensor = at::empty(
-      {64, 64},
-      at::TensorOptions().dtype(at::kFloat).device(communicator_->device()));
-  std::string error = SymmetricTensor::validate(normal_tensor);
-  EXPECT_FALSE(error.empty())
-      << "SymmetricTensor::validate() should reject a normal CUDA tensor";
-}
-
-TEST_F(SymmetricTensorTest, Validate_AcceptsSymmetricAllocation) {
-  if (communicator_->size() == 1) {
-    GTEST_SKIP() << "Skipping test for single device";
-  }
-  if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
-    GTEST_SKIP() << "validate() for allocate() output is defined for Native backend only";
-  }
-  at::Tensor sym_tensor = SymmetricTensor::allocate(
-      {128, 128}, at::ScalarType::Float, communicator_->device());
-  std::string error = SymmetricTensor::validate(sym_tensor);
-  EXPECT_TRUE(error.empty())
-      << "SymmetricTensor::validate() should accept tensor from allocate(); got: "
-      << error;
-}
-
-TEST_F(SymmetricTensorTest, Constructor_ThrowsOnInvalidTensor) {
-  at::Tensor normal_tensor = at::empty(
-      {8, 8},
-      at::TensorOptions().dtype(at::kFloat).device(communicator_->device()));
-  EXPECT_THROW(
-      { SymmetricTensor sym_tensor(normal_tensor); },
-      c10::Error)
-      << "SymmetricTensor constructor should throw when given a non-symmetric tensor";
-}
-
-// -----------------------------------------------------------------------------
-// Backend-agnostic and Native backend correctness (allocate + remote access)
-// -----------------------------------------------------------------------------
-
 TEST_F(SymmetricTensorTest, BasicAllocation) {
   if (communicator_->size() == 1) {
     GTEST_SKIP() << "Skipping test for single device";

From 8962475d15c45baec352b30f820d9c47955b9b88 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Mon, 16 Mar 2026 13:55:37 +0200
Subject: [PATCH 03/16] Alloc, rendezvous passing

---
 csrc/multidevice/communicator.cpp             |  55 +++++--
 csrc/multidevice/communicator.h               |  35 +++--
 csrc/multidevice/symmetric_tensor.cpp         | 144 +++++++++++-------
 csrc/multidevice/symmetric_tensor.h           |   5 +-
 .../cpp/test_multidevice_symmetric_tensor.cpp |  76 +++------
 5 files changed, 177 insertions(+), 138 deletions(-)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 85fc15ae3a5..b0642dc3f7c 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -14,7 +14,13 @@
 #include <numeric>
 
 #ifdef NVFUSER_DISTRIBUTED
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+#include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
+#endif
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
+#endif
 #include <torch/csrc/distributed/c10d/exception.h>
 #ifdef USE_C10D_NCCL
 #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
@@ -362,6 +368,12 @@ void Communicator::cleanup() {
       pg_nccl->shutdown();
     }
   }
+#endif
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+  for (const auto& entry : process_groups_) {
+    c10d::unregister_process_group(entry.first);
+  }
+  process_groups_.clear();
 #endif
   backends_.clear();
 }
@@ -402,6 +414,28 @@ c10d::Backend* Communicator::getBackendForTeam(
     }();
 #else
     backends_[team_key] = nullptr;
+#endif
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+    std::optional<c10d::ProcessGroup::BackendType> pg_backend =
+    (b == CommunicatorBackend::kNccl)
+        ? std::optional<c10d::ProcessGroup::BackendType>(
+              c10d::ProcessGroup::BackendType::NCCL)
+        : std::nullopt;
+    if (backends_[team_key] != nullptr && pg_backend.has_value()) {
+      auto rank_it = std::find(team.begin(), team.end(), deviceId());
+      RankType team_rank = std::distance(team.begin(), rank_it);
+
+      auto pg = c10::make_intrusive<c10d::ProcessGroup>(
+          c10::make_intrusive<c10d::PrefixStore>(team_key, store_),
+          team_rank,
+          static_cast<int>(team.size()));
+      pg->setBackend(c10::DeviceType::CUDA, *pg_backend, backends_[team_key]);
+      pg->setDefaultBackend(*pg_backend);
+      pg->setGroupName(team_key);
+
+      c10d::register_process_group(team_key, pg);
+      process_groups_[team_key] = std::move(pg);
+    }
 #endif
   }
   return backends_.at(team_key).get();
@@ -424,21 +458,14 @@ void Communicator::barrier(std::optional<CommunicatorBackend> backend) {
   getWorld(backend)->barrier(options)->wait();
 }
 
-#ifdef NVFUSER_DISTRIBUTED
-c10::intrusive_ptr<c10d::Store> Communicator::getStore() const {
-  return c10::intrusive_ptr<c10d::Store>(store_);
-}
-
-c10::intrusive_ptr<c10d::Backend> Communicator::getWorldBackendIntrusivePtr(
-    std::optional<CommunicatorBackend> backend) {
-  std::vector<RankType> all_ranks(size_);
-  std::iota(all_ranks.begin(), all_ranks.end(), 0);
-  CommunicatorBackend b = backend.value_or(default_backend_);
-  std::string team_key = getTeamKey(all_ranks, b);
-  (void)getBackendForTeam(all_ranks, backend, "");
-  return backends_.at(team_key);
+std::string Communicator::getSymmMemGroupKey(
+  std::optional<CommunicatorBackend> backend) {
+std::vector<RankType> all_ranks(size_);
+std::iota(all_ranks.begin(), all_ranks.end(), 0);
+CommunicatorBackend b = backend.value_or(default_backend_);
+(void)getBackendForTeam(all_ranks, b, "symm_mem_");
+return getTeamKey(all_ranks, b);
 }
-#endif
 
 } // namespace nvfuser
 
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 45a7f9e4f4f..57d3d48a750 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -11,9 +11,19 @@
 #include <ATen/core/ivalue.h>
 #include <c10/util/intrusive_ptr.h>
 
+#if defined(NVFUSER_DISTRIBUTED) && \
+    __has_include(<torch/csrc/distributed/c10d/GroupRegistry.hpp>) && \
+    __has_include(<torch/csrc/distributed/c10d/ProcessGroup.hpp>)
+#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 1
+#else
+#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 0
+#endif
+
 #ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/Backend.hpp>
-#include <torch/csrc/distributed/c10d/Store.hpp>
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
+#endif
 #include <torch/csrc/distributed/c10d/TCPStore.hpp>
 #include <torch/csrc/distributed/c10d/Work.hpp>
 #else
@@ -111,6 +121,10 @@ class NVF_API Communicator {
   c10d::Backend* getWorld(
       std::optional<CommunicatorBackend> backend = std::nullopt);
 
+  // Returns the world process-group name for the given backend.
+  std::string getSymmMemGroupKey(
+    std::optional<CommunicatorBackend> backend = std::nullopt);
+
   // returns if a backend is available for creation
   bool isBackendAvailable(CommunicatorBackend backend) const {
     if (backend == CommunicatorBackend::kUcc) {
@@ -125,17 +139,9 @@ class NVF_API Communicator {
     return store_.get();
   }
 
-#ifdef NVFUSER_DISTRIBUTED
-  // Returns the store as an intrusive_ptr for use with PyTorch symmetric
-  // memory (c10d::symmetric_memory::set_group_info).
-  c10::intrusive_ptr<c10d::Store> getStore() const;
-
-  // Returns the world backend as an intrusive_ptr so it can be registered with
-  // c10d::register_process_group (e.g. for PyTorch symmetric memory NCCL
-  // rendezvous, which resolves the group by name).
-  c10::intrusive_ptr<c10d::Backend> getWorldBackendIntrusivePtr(
-      std::optional<CommunicatorBackend> backend = std::nullopt);
-#endif
+  c10::intrusive_ptr<c10d::Store> getStore() const {
+    return c10::intrusive_ptr<c10d::Store>(store_);
+  }
 
  private:
   Communicator(
@@ -166,6 +172,11 @@ class NVF_API Communicator {
   c10::intrusive_ptr<c10d::TCPStore> store_;
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
+#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
+  // c10d process-group wrappers registered for symmetric-memory rendezvous.
+  std::unordered_map<std::string, c10::intrusive_ptr<c10d::ProcessGroup>>
+      process_groups_;
+#endif
 };
 
 } // namespace nvfuser
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index b3e1ca49c8e..7ca46e7a290 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -25,39 +25,66 @@ namespace nvfuser {
 namespace {
 
 #ifdef NVFUSER_DISTRIBUTED
-const char* kPyTorchSymmMemGroupName = "nvfuser_symm";
-
-void ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
-  static std::once_flag once;
-  std::call_once(once, [backend]() {
-    const char* name = nullptr;
-    switch (backend) {
-      case SymmetricMemoryBackend::PyTorchNccl:
-        name = "NCCL";
-        break;
-      case SymmetricMemoryBackend::PyTorchNvshmem:
-        name = "NVSHMEM";
-        break;
-      case SymmetricMemoryBackend::PyTorchCuda:
-        name = "CUDA";
-        break;
-      default:
-        NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend");
-    }
-    c10d::symmetric_memory::set_backend(name);
+  std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
+    static std::once_flag once;
+    std::call_once(once, [backend]() {
+      const char* name = nullptr;
+      switch (backend) {
+        case SymmetricMemoryBackend::PyTorchNccl:
+          name = "NCCL";
+          break;
+        case SymmetricMemoryBackend::PyTorchNvshmem:
+          name = "NVSHMEM";
+          break;
+        case SymmetricMemoryBackend::PyTorchCuda:
+          name = "CUDA";
+          break;
+        default:
+          NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend");
+      }
+      c10d::symmetric_memory::set_backend(name);
+    });
+  
     Communicator& comm = Communicator::getInstance();
     NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory");
-    c10d::symmetric_memory::set_group_info(
-        kPyTorchSymmMemGroupName,
-        static_cast<int>(comm.deviceId()),
-        static_cast<int>(comm.size()),
-        comm.getStore());
-    // c10d::register_process_group(
-    //   kPyTorchSymmMemGroupName,
-    //   comm.getWorldBackendIntrusivePtr(CommunicatorBackend::kNccl));
-  });
-}
-#endif
+    // TODO: Remove after nccl version update
+    auto maybe_set_group_info = [&](const std::string& group_name) {
+      try {
+        c10d::symmetric_memory::set_group_info(
+            group_name,
+            static_cast<int>(comm.deviceId()),
+            static_cast<int>(comm.size()),
+            comm.getStore());
+      } catch (const c10::Error&) {
+        // already registered
+      }
+    };
+    // Always return a valid group name
+    if (backend == SymmetricMemoryBackend::PyTorchNccl) {
+      NVF_CHECK(
+          comm.isBackendAvailable(CommunicatorBackend::kNccl),
+          "NCCL backend is required for symmetric_memory_backend(nccl)");
+      
+      const std::string group_name = comm.getSymmMemGroupKey(CommunicatorBackend::kNccl);
+
+      // This build expects default group name "0" to be registered.
+      // TODO: Remove after nccl version update
+      static std::once_flag group_once;
+      std::call_once(group_once, [&]() {
+        maybe_set_group_info("0");
+        maybe_set_group_info(group_name);
+      });
+
+      comm.barrier(CommunicatorBackend::kNccl);
+      return group_name;
+    }
+  
+    NVF_ERROR(
+        false,
+        "No c10d backend available for symmetric memory rendezvous. "
+        "Expected NCCL or UCC process group.");
+  }
+#endif  
 
 
 // Returns the allocation granularity for symmetric memory.
@@ -133,7 +160,7 @@ at::Tensor SymmetricTensor::allocate(
 
 #ifdef NVFUSER_DISTRIBUTED
   if (backend != SymmetricMemoryBackend::Native) {
-    ensurePyTorchSymmMemBackend(backend);
+    const std::string group_name = ensurePyTorchSymmMemBackend(backend);
     std::vector<int64_t> strides(sizes.size());
     strides.back() = 1;
     for (int64_t i = (int64_t)strides.size() - 2; i >= 0; --i) {
@@ -143,7 +170,7 @@ at::Tensor SymmetricTensor::allocate(
     c10::optional<std::string> alloc_group_name =
         (backend == SymmetricMemoryBackend::PyTorchNccl)
         ? c10::nullopt
-        : c10::optional<std::string>(kPyTorchSymmMemGroupName);
+        : c10::optional<std::string>(group_name);
     return c10d::symmetric_memory::empty_strided_p2p(
         sizes,
         strides,
@@ -289,12 +316,11 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
 #ifdef NVFUSER_DISTRIBUTED
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
   if (backend != SymmetricMemoryBackend::Native) {
-    ensurePyTorchSymmMemBackend(backend);
     Communicator& comm = Communicator::getInstance();
     world_size_ = comm.size();
     my_device_id_ = comm.deviceId();
     requested_size_ = local_tensor.numel() * local_tensor.element_size();
-    return; // Rendezvous runs in setupRemoteHandles()
+    return;
   }
 #endif
 
@@ -340,7 +366,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
 
 SymmetricTensor::~SymmetricTensor() {
 #ifdef NVFUSER_DISTRIBUTED
-  if (py_symm_handle_) {
+  if (torch_symm_handle_) {
     return; // PyTorch backend: no native VMM cleanup
   }
 #endif
@@ -395,14 +421,15 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
   }
 #ifdef NVFUSER_DISTRIBUTED
   // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles).
-  if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
-    ensurePyTorchSymmMemBackend(getSymmetricMemoryBackend());
-    py_symm_handle_ = c10d::symmetric_memory::rendezvous(
-        local_tensor_, c10::optional<std::string>(kPyTorchSymmMemGroupName));
+  SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
+  if (backend != SymmetricMemoryBackend::Native) {
+    const std::string group_name = ensurePyTorchSymmMemBackend(backend);
+    torch_symm_handle_ = c10d::symmetric_memory::rendezvous(
+        local_tensor_, group_name);
     are_remote_tensors_setup_ = true;
-    if (py_symm_handle_->has_multicast_support()) {
+    if (torch_symm_handle_->has_multicast_support()) {
       is_multicast_setup_ = true;
-      mc_ptr_ = py_symm_handle_->get_multicast_ptr();
+      mc_ptr_ = torch_symm_handle_->get_multicast_ptr();
     }
     return;
   }
@@ -485,9 +512,24 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
   }
 
 #ifdef NVFUSER_DISTRIBUTED
-  if (py_symm_handle_) {
-    return py_symm_handle_->get_remote_tensor(
-        rank, local_tensor_.sizes(), local_tensor_.scalar_type());
+  if (torch_symm_handle_) {
+    return torch_symm_handle_->get_remote_tensor(
+      rank, local_tensor_.sizes(), local_tensor_.scalar_type());
+    // Tried below code for older NCCL build, but it was not working.
+    // const int64_t storage_offset = local_tensor_.storage_offset();
+    // auto* handle = torch_symm_handle_.get();
+    // auto peer_ptr = handle->get_buffer_ptrs();
+    // NVF_CHECK(
+    //   peer_ptr != nullptr,
+    //   "Cannot get buffer across nodes, my rank: ",
+    //   handle->get_rank(),
+    //   ", peer: ",
+    //   rank);
+    // std::cout << "symm_mem type: " << typeid(*handle).name() << std::endl;
+    // const int ws = handle->get_world_size();
+    // NVF_CHECK(ws > 0, "SymmetricMemory world_size is 0");
+    // return handle->get_buffer(
+    //     rank, local_tensor_.sizes(), local_tensor_.scalar_type(), storage_offset);
   }
 #endif
 
@@ -503,9 +545,9 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
 
 void* SymmetricTensor::multicastPtr() const {
 #ifdef NVFUSER_DISTRIBUTED
-  if (py_symm_handle_) {
-    return py_symm_handle_->has_multicast_support()
-        ? py_symm_handle_->get_multicast_ptr()
+  if (torch_symm_handle_) {
+    return torch_symm_handle_->has_multicast_support()
+        ? torch_symm_handle_->get_multicast_ptr()
         : nullptr;
   }
 #endif
@@ -518,7 +560,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
     return;
   }
 #ifdef NVFUSER_DISTRIBUTED
-  if (py_symm_handle_) {
+  if (torch_symm_handle_) {
     NVF_ERROR(
         false,
         "Contiguous view is not yet supported for PyTorch symmetric memory backend. "
@@ -589,7 +631,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
 
 at::Tensor SymmetricTensor::getContiguousView() const {
 #ifdef NVFUSER_DISTRIBUTED
-  if (py_symm_handle_) {
+  if (torch_symm_handle_) {
     NVF_ERROR(
         false,
         "Contiguous view is not yet supported for PyTorch symmetric memory backend.");
@@ -603,7 +645,7 @@ void SymmetricTensor::setupMulticast(
     int64_t exporter_rank,
     const std::string& tag) {
 #ifdef NVFUSER_DISTRIBUTED
-  if (py_symm_handle_) {
+  if (torch_symm_handle_) {
     return; // PyTorch backend: multicast handled by backend if supported
   }
 #endif
diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h
index c928a7d5469..eedf3602846 100644
--- a/csrc/multidevice/symmetric_tensor.h
+++ b/csrc/multidevice/symmetric_tensor.h
@@ -28,7 +28,7 @@ namespace nvfuser {
 // - Native (default): Fuser's own CUDA VMM + IPC implementation; maintained.
 // - PyTorch (Nccl, Nvshmem, Cuda): Use PyTorch's symmetric memory
 //   (torch.distributed._symmetric_memory) with the chosen transport backend.
-//   Select via NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl|pytorch_nvshmem|pytorch_cuda).
+//   Select via NVFUSER_ENABLE=symmetric_memory_backend(native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda).
 //   Native remains the default when the option is not set.
 class SymmetricTensor {
  public:
@@ -85,8 +85,7 @@ class SymmetricTensor {
   bool is_contiguous_view_setup_ = false;
   at::Tensor contiguous_view_;
 #ifdef NVFUSER_DISTRIBUTED
-  // When set, remote/multicast APIs delegate to PyTorch symmetric memory.
-  c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> py_symm_handle_;
+  c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> torch_symm_handle_;
 #endif
 };
 
diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp
index 19e213e6ab9..397189541ad 100644
--- a/tests/cpp/test_multidevice_symmetric_tensor.cpp
+++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp
@@ -13,20 +13,6 @@ namespace nvfuser {
 
 using SymmetricTensorTest = MultiDeviceTest;
 
-// -----------------------------------------------------------------------------
-// Symmetric memory backend and option tests
-// -----------------------------------------------------------------------------
-
-TEST_F(SymmetricTensorTest, GetSymmetricMemoryBackend_ReturnsValidBackend) {
-  SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
-  EXPECT_TRUE(
-      backend == SymmetricMemoryBackend::Native ||
-      backend == SymmetricMemoryBackend::PyTorchNccl ||
-      backend == SymmetricMemoryBackend::PyTorchNvshmem ||
-      backend == SymmetricMemoryBackend::PyTorchCuda)
-      << "getSymmetricMemoryBackend() returned an invalid backend value";
-}
-
 TEST_F(SymmetricTensorTest, BasicAllocation) {
   if (communicator_->size() == 1) {
     GTEST_SKIP() << "Skipping test for single device";
@@ -69,50 +55,24 @@ TEST_F(SymmetricTensorTest, BasicAllocation) {
   }
 }
 
-// Same remote-access correctness as BasicAllocation but only runs when
-// PyTorch symmetric memory backend is selected (NVFUSER_ENABLE=
-// symmetric_memory_backend(pytorch_nccl|pytorch_nvshmem|pytorch_cuda)).
-// Run with e.g. NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl) to
-// exercise the PyTorch path.
-// TEST_F(SymmetricTensorTest, PyTorchBackend_RemoteAccessCorrectness) {
-//   if (communicator_->size() == 1) {
-//     GTEST_SKIP() << "Skipping test for single device";
-//   }
-//   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
-//   if (backend == SymmetricMemoryBackend::Native) {
-//     GTEST_SKIP()
-//         << "PyTorch backend not selected; set NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl) to run";
-//   }
-
-//   const int64_t rank = communicator_->deviceId();
-//   const int64_t world_size = communicator_->size();
-
-//   at::Tensor local_tensor = SymmetricTensor::allocate(
-//       {256, 512}, at::ScalarType::Float, communicator_->device());
-//   SymmetricTensor sym_tensor(local_tensor);
-
-//   EXPECT_TRUE(local_tensor.is_cuda());
-//   EXPECT_EQ(local_tensor.numel(), 256 * 512);
-
-//   float local_value = static_cast<float>(rank + 200);
-//   local_tensor.fill_(local_value);
-
-//   sym_tensor.setupRemoteHandles();
-
-//   for (int64_t peer_rank = 0; peer_rank < world_size; ++peer_rank) {
-//     void* peer_ptr = sym_tensor.remoteTensor(peer_rank).data_ptr();
-//     EXPECT_NE(peer_ptr, nullptr);
-
-//     float peer_value;
-//     NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy(
-//         &peer_value, peer_ptr, sizeof(float), cudaMemcpyDeviceToHost));
-
-//     float expected_value = static_cast<float>(peer_rank + 200);
-//     EXPECT_FLOAT_EQ(peer_value, expected_value)
-//         << "Rank " << rank << " reading from rank " << peer_rank
-//         << " (PyTorch backend)";
-//   }
-// }
+TEST_F(SymmetricTensorTest, AllocateOnly) {
+  if (communicator_->size() == 1) {
+    GTEST_SKIP() << "Skipping test for single device";
+  }
+
+  at::Tensor local_tensor = SymmetricTensor::allocate(
+      {64, 128}, at::ScalarType::Float, communicator_->device());
+
+  EXPECT_TRUE(local_tensor.is_cuda());
+  EXPECT_EQ(local_tensor.scalar_type(), at::ScalarType::Float);
+  EXPECT_EQ(local_tensor.numel(), 64 * 128);
+  EXPECT_EQ(local_tensor.sizes()[0], 64);
+  EXPECT_EQ(local_tensor.sizes()[1], 128);
+
+  SymmetricTensor sym_tensor(local_tensor);
+  EXPECT_EQ(sym_tensor.localTensor().numel(), 64 * 128);
+  EXPECT_EQ(sym_tensor.localTensor().data_ptr(), local_tensor.data_ptr());
+}
 
 TEST_F(SymmetricTensorTest, PreallocatedTensor) {
   if (communicator_->size() == 1) {

From 67181c89fdee938d2d597ed8f18cbf4eb073e8c3 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Tue, 17 Mar 2026 17:30:21 +0200
Subject: [PATCH 04/16] multicast pending

---
 1                                     | 25 ++++++++++++
 csrc/multidevice/communicator.cpp     |  2 +-
 csrc/multidevice/communicator.h       |  4 --
 csrc/multidevice/symmetric_tensor.cpp | 58 ++++++---------------------
 fbuild.sh                             |  1 +
 5 files changed, 40 insertions(+), 50 deletions(-)
 create mode 100644 1

diff --git a/1 b/1
new file mode 100644
index 00000000000..de4cad83241
--- /dev/null
+++ b/1
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+export CC=clang-20
+export CXX=clang++-20
+export CUDAHOSTCXX=/usr/bin/clang++-20
+export LDFLAGS="-fuse-ld=mold"
+
+export NVFUSER_BUILD_ENABLE_PCH
+
+export UCC_HOME="/opt/hpcx/ucc"
+export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc"
+export UCX_HOME="/opt/hpcx/ucx"
+export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx"
+
+# export TORCH_CUDA_ARCH_LIST="9.0"
+
+export NVFUSER_BUILD_WITH_UCC=1
+export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser
+export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY
+
+# Enable debug mode, leave empty for non-debug compilation
+export NVFUSER_BUILD_BUILD_TYPE=Debug
+export RUN_CMAKE=""
+
+pip install -v -e ./python --no-build-isolation
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index b0642dc3f7c..c1dccfe734e 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -463,7 +463,7 @@ std::string Communicator::getSymmMemGroupKey(
 std::vector<RankType> all_ranks(size_);
 std::iota(all_ranks.begin(), all_ranks.end(), 0);
 CommunicatorBackend b = backend.value_or(default_backend_);
-(void)getBackendForTeam(all_ranks, b, "symm_mem_");
+(void)getBackendForTeam(all_ranks, b);
 return getTeamKey(all_ranks, b);
 }
 
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 57d3d48a750..58ccc2b153a 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -139,10 +139,6 @@ class NVF_API Communicator {
     return store_.get();
   }
 
-  c10::intrusive_ptr<c10d::Store> getStore() const {
-    return c10::intrusive_ptr<c10d::Store>(store_);
-  }
-
  private:
   Communicator(
       CommunicatorBackend backend = comm_backend_default,
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index 7ca46e7a290..3a6cbf1d7a0 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -47,18 +47,7 @@ namespace {
   
     Communicator& comm = Communicator::getInstance();
     NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory");
-    // TODO: Remove after nccl version update
-    auto maybe_set_group_info = [&](const std::string& group_name) {
-      try {
-        c10d::symmetric_memory::set_group_info(
-            group_name,
-            static_cast<int>(comm.deviceId()),
-            static_cast<int>(comm.size()),
-            comm.getStore());
-      } catch (const c10::Error&) {
-        // already registered
-      }
-    };
+
     // Always return a valid group name
     if (backend == SymmetricMemoryBackend::PyTorchNccl) {
       NVF_CHECK(
@@ -67,14 +56,6 @@ namespace {
       
       const std::string group_name = comm.getSymmMemGroupKey(CommunicatorBackend::kNccl);
 
-      // This build expects default group name "0" to be registered.
-      // TODO: Remove after nccl version update
-      static std::once_flag group_once;
-      std::call_once(group_once, [&]() {
-        maybe_set_group_info("0");
-        maybe_set_group_info(group_name);
-      });
-
       comm.barrier(CommunicatorBackend::kNccl);
       return group_name;
     }
@@ -421,17 +402,19 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
   }
 #ifdef NVFUSER_DISTRIBUTED
   // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles).
-  SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
-  if (backend != SymmetricMemoryBackend::Native) {
-    const std::string group_name = ensurePyTorchSymmMemBackend(backend);
-    torch_symm_handle_ = c10d::symmetric_memory::rendezvous(
-        local_tensor_, group_name);
-    are_remote_tensors_setup_ = true;
-    if (torch_symm_handle_->has_multicast_support()) {
-      is_multicast_setup_ = true;
-      mc_ptr_ = torch_symm_handle_->get_multicast_ptr();
+  if(is_multicast_setup_==false) {
+    SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
+    if (backend != SymmetricMemoryBackend::Native) {
+      const std::string group_name = ensurePyTorchSymmMemBackend(backend);
+      torch_symm_handle_ = c10d::symmetric_memory::rendezvous(
+          local_tensor_, group_name);
+      are_remote_tensors_setup_ = true;
+      if (torch_symm_handle_->has_multicast_support()) {
+        is_multicast_setup_ = true;
+        mc_ptr_ = torch_symm_handle_->get_multicast_ptr();
+      }
+      return;
     }
-    return;
   }
 #endif
   Communicator& comm = Communicator::getInstance();
@@ -515,21 +498,6 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
   if (torch_symm_handle_) {
     return torch_symm_handle_->get_remote_tensor(
       rank, local_tensor_.sizes(), local_tensor_.scalar_type());
-    // Tried below code for older NCCL build, but it was not working.
-    // const int64_t storage_offset = local_tensor_.storage_offset();
-    // auto* handle = torch_symm_handle_.get();
-    // auto peer_ptr = handle->get_buffer_ptrs();
-    // NVF_CHECK(
-    //   peer_ptr != nullptr,
-    //   "Cannot get buffer across nodes, my rank: ",
-    //   handle->get_rank(),
-    //   ", peer: ",
-    //   rank);
-    // std::cout << "symm_mem type: " << typeid(*handle).name() << std::endl;
-    // const int ws = handle->get_world_size();
-    // NVF_CHECK(ws > 0, "SymmetricMemory world_size is 0");
-    // return handle->get_buffer(
-    //     rank, local_tensor_.sizes(), local_tensor_.scalar_type(), storage_offset);
   }
 #endif
 
diff --git a/fbuild.sh b/fbuild.sh
index e16a2e2cdd9..de4cad83241 100755
--- a/fbuild.sh
+++ b/fbuild.sh
@@ -2,6 +2,7 @@
 
 export CC=clang-20
 export CXX=clang++-20
+export CUDAHOSTCXX=/usr/bin/clang++-20
 export LDFLAGS="-fuse-ld=mold"
 
 export NVFUSER_BUILD_ENABLE_PCH

From eea57d807228479f6e3129fef5bbaeb0a03524c7 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Fri, 20 Mar 2026 02:11:35 +0200
Subject: [PATCH 05/16] all backends passing

---
 csrc/multidevice/communicator.cpp             |  3 +-
 csrc/multidevice/symmetric_tensor.cpp         | 78 ++++++++++---------
 csrc/multidevice/symmetric_tensor.h           |  2 +-
 .../cpp/test_multidevice_symmetric_tensor.cpp | 25 ++----
 4 files changed, 50 insertions(+), 58 deletions(-)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index c1dccfe734e..ba40835d9da 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -467,5 +467,4 @@ CommunicatorBackend b = backend.value_or(default_backend_);
 return getTeamKey(all_ranks, b);
 }
 
-} // namespace nvfuser
-
+} // namespace nvfuser
\ No newline at end of file
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index 3a6cbf1d7a0..b53bfbe2235 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -32,9 +32,11 @@ namespace {
       switch (backend) {
         case SymmetricMemoryBackend::PyTorchNccl:
           name = "NCCL";
+          c10d::symmetric_memory::set_backend(name);
           break;
         case SymmetricMemoryBackend::PyTorchNvshmem:
           name = "NVSHMEM";
+          c10d::symmetric_memory::set_backend(name);
           break;
         case SymmetricMemoryBackend::PyTorchCuda:
           name = "CUDA";
@@ -42,30 +44,39 @@ namespace {
         default:
           NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend");
       }
-      c10d::symmetric_memory::set_backend(name);
     });
-  
+
     Communicator& comm = Communicator::getInstance();
     NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory");
 
     // Always return a valid group name
-    if (backend == SymmetricMemoryBackend::PyTorchNccl) {
+    if (backend != SymmetricMemoryBackend::Native) {
       NVF_CHECK(
           comm.isBackendAvailable(CommunicatorBackend::kNccl),
           "NCCL backend is required for symmetric_memory_backend(nccl)");
-      
+
       const std::string group_name = comm.getSymmMemGroupKey(CommunicatorBackend::kNccl);
 
+      static std::once_flag pg0_once;
+      std::call_once(pg0_once, [&]() {
+        try {
+          (void)c10d::resolve_process_group("0");
+        } catch (const c10::Error&) {
+          auto pg = c10d::resolve_process_group(group_name);
+          c10d::register_process_group("0", pg);
+        }
+      });
+
       comm.barrier(CommunicatorBackend::kNccl);
       return group_name;
     }
-  
+
     NVF_ERROR(
         false,
         "No c10d backend available for symmetric memory rendezvous. "
         "Expected NCCL or UCC process group.");
   }
-#endif  
+#endif
 
 
 // Returns the allocation granularity for symmetric memory.
@@ -149,7 +160,7 @@ at::Tensor SymmetricTensor::allocate(
     }
     // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name.
     c10::optional<std::string> alloc_group_name =
-        (backend == SymmetricMemoryBackend::PyTorchNccl)
+        (backend == SymmetricMemoryBackend::PyTorchNccl || backend == SymmetricMemoryBackend::PyTorchNvshmem)
         ? c10::nullopt
         : c10::optional<std::string>(group_name);
     return c10d::symmetric_memory::empty_strided_p2p(
@@ -158,7 +169,7 @@ at::Tensor SymmetricTensor::allocate(
         dtype,
         device,
         alloc_group_name,
-        c10::nullopt);
+        /*alloc_id=*/c10::nullopt);
   }
 #else
   if (backend != SymmetricMemoryBackend::Native) {
@@ -226,6 +237,9 @@ at::Tensor SymmetricTensor::allocate(
 }
 
 std::string SymmetricTensor::validate(at::Tensor tensor) {
+  if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
+    return "";
+  }
   int is_vmm_supported;
   NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute(
       &is_vmm_supported,
@@ -402,19 +416,17 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
   }
 #ifdef NVFUSER_DISTRIBUTED
   // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles).
-  if(is_multicast_setup_==false) {
-    SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
-    if (backend != SymmetricMemoryBackend::Native) {
-      const std::string group_name = ensurePyTorchSymmMemBackend(backend);
-      torch_symm_handle_ = c10d::symmetric_memory::rendezvous(
-          local_tensor_, group_name);
-      are_remote_tensors_setup_ = true;
-      if (torch_symm_handle_->has_multicast_support()) {
-        is_multicast_setup_ = true;
-        mc_ptr_ = torch_symm_handle_->get_multicast_ptr();
-      }
-      return;
+  SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
+  if (backend != SymmetricMemoryBackend::Native) {
+    const std::string group_name = ensurePyTorchSymmMemBackend(backend);
+    torch_symm_handle_ = c10d::symmetric_memory::rendezvous(
+        local_tensor_, group_name);
+    are_remote_tensors_setup_ = true;
+    if (torch_symm_handle_->has_multicast_support()) {
+      is_multicast_setup_ = true;
+      mc_ptr_ = torch_symm_handle_->get_multicast_ptr();
     }
+    return;
   }
 #endif
   Communicator& comm = Communicator::getInstance();
@@ -512,13 +524,6 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
 }
 
 void* SymmetricTensor::multicastPtr() const {
-#ifdef NVFUSER_DISTRIBUTED
-  if (torch_symm_handle_) {
-    return torch_symm_handle_->has_multicast_support()
-        ? torch_symm_handle_->get_multicast_ptr()
-        : nullptr;
-  }
-#endif
   NVF_CHECK(is_multicast_setup_, "Multicast not setup");
   return mc_ptr_;
 }
@@ -529,7 +534,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
   }
 #ifdef NVFUSER_DISTRIBUTED
   if (torch_symm_handle_) {
-    NVF_ERROR(
+    NVF_THROW(
         false,
         "Contiguous view is not yet supported for PyTorch symmetric memory backend. "
         "Use native backend for SymmetricContiguousView.");
@@ -600,7 +605,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
 at::Tensor SymmetricTensor::getContiguousView() const {
 #ifdef NVFUSER_DISTRIBUTED
   if (torch_symm_handle_) {
-    NVF_ERROR(
+    NVF_THROW(
         false,
         "Contiguous view is not yet supported for PyTorch symmetric memory backend.");
   }
@@ -612,16 +617,19 @@ at::Tensor SymmetricTensor::getContiguousView() const {
 void SymmetricTensor::setupMulticast(
     int64_t exporter_rank,
     const std::string& tag) {
-#ifdef NVFUSER_DISTRIBUTED
-  if (torch_symm_handle_) {
-    return; // PyTorch backend: multicast handled by backend if supported
-  }
-#endif
 #if (CUDA_VERSION >= 13000)
   if (is_multicast_setup_) {
     return;
   }
-
+#ifdef NVFUSER_DISTRIBUTED
+  if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
+    if (!torch_symm_handle_) {
+      setupRemoteHandles(tag);
+      NVF_CHECK(torch_symm_handle_->has_multicast_support(), "Multicast not supported");
+    }
+    return;
+  }
+#endif
   Communicator& comm = Communicator::getInstance();
   const int64_t my_rank = comm.deviceId();
   const int64_t local_rank = comm.local_rank();
diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h
index eedf3602846..20f4d63ccda 100644
--- a/csrc/multidevice/symmetric_tensor.h
+++ b/csrc/multidevice/symmetric_tensor.h
@@ -25,7 +25,7 @@ namespace nvfuser {
 // interoperability and support for pre-allocated user buffers.
 //
 // Backends (see SymmetricMemoryBackend in ipc_utils.h):
-// - Native (default): Fuser's own CUDA VMM + IPC implementation; maintained.
+// - Native (default): Fuser's own CUDA VMM + IPC implementation
 // - PyTorch (Nccl, Nvshmem, Cuda): Use PyTorch's symmetric memory
 //   (torch.distributed._symmetric_memory) with the chosen transport backend.
 //   Select via NVFUSER_ENABLE=symmetric_memory_backend(native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda).
diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp
index 397189541ad..020ea0b8745 100644
--- a/tests/cpp/test_multidevice_symmetric_tensor.cpp
+++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp
@@ -5,9 +5,9 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
-#include "multidevice/ipc_utils.h"
 #include "multidevice/symmetric_tensor.h"
 #include "tests/cpp/multidevice.h"
+#include "multidevice/ipc_utils.h"
 
 namespace nvfuser {
 
@@ -55,25 +55,6 @@ TEST_F(SymmetricTensorTest, BasicAllocation) {
   }
 }
 
-TEST_F(SymmetricTensorTest, AllocateOnly) {
-  if (communicator_->size() == 1) {
-    GTEST_SKIP() << "Skipping test for single device";
-  }
-
-  at::Tensor local_tensor = SymmetricTensor::allocate(
-      {64, 128}, at::ScalarType::Float, communicator_->device());
-
-  EXPECT_TRUE(local_tensor.is_cuda());
-  EXPECT_EQ(local_tensor.scalar_type(), at::ScalarType::Float);
-  EXPECT_EQ(local_tensor.numel(), 64 * 128);
-  EXPECT_EQ(local_tensor.sizes()[0], 64);
-  EXPECT_EQ(local_tensor.sizes()[1], 128);
-
-  SymmetricTensor sym_tensor(local_tensor);
-  EXPECT_EQ(sym_tensor.localTensor().numel(), 64 * 128);
-  EXPECT_EQ(sym_tensor.localTensor().data_ptr(), local_tensor.data_ptr());
-}
-
 TEST_F(SymmetricTensorTest, PreallocatedTensor) {
   if (communicator_->size() == 1) {
     GTEST_SKIP() << "Skipping test for single device";
@@ -188,6 +169,9 @@ TEST_F(SymmetricTensorTest, ContiguousView) {
   if (communicator_->size() == 1) {
     GTEST_SKIP() << "Skipping test for single device";
   }
+  if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
+    GTEST_SKIP() << "Skipping test for Pytorch symmetric memory backend";
+  }
 
   const int64_t rank = communicator_->deviceId();
   const int64_t world_size = communicator_->size();
@@ -252,6 +236,7 @@ TEST_F(SymmetricTensorTest, SmallAllocation) {
   if (communicator_->size() == 1) {
     GTEST_SKIP() << "Skipping test for single device";
   }
+  std::cout << "Vishal chishta" << std::endl;
 
   const int64_t rank = communicator_->deviceId();
   const int64_t world_size = communicator_->size();

From a9ddffdd3216b6c6d6919c51d5fc197e15ccaff2 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Fri, 20 Mar 2026 22:24:39 +0200
Subject: [PATCH 06/16] delete build file

---
 fbuild.sh | 25 -------------------------
 1 file changed, 25 deletions(-)
 delete mode 100755 fbuild.sh

diff --git a/fbuild.sh b/fbuild.sh
deleted file mode 100755
index de4cad83241..00000000000
--- a/fbuild.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-export CC=clang-20
-export CXX=clang++-20
-export CUDAHOSTCXX=/usr/bin/clang++-20
-export LDFLAGS="-fuse-ld=mold"
-
-export NVFUSER_BUILD_ENABLE_PCH
-
-export UCC_HOME="/opt/hpcx/ucc"
-export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc"
-export UCX_HOME="/opt/hpcx/ucx"
-export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx"
-
-# export TORCH_CUDA_ARCH_LIST="9.0"
-
-export NVFUSER_BUILD_WITH_UCC=1
-export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser
-export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY
-
-# Enable debug mode, leave empty for non-debug compilation
-export NVFUSER_BUILD_BUILD_TYPE=Debug
-export RUN_CMAKE=""
-
-pip install -v -e ./python --no-build-isolation

From 8e62ccce29799b9b137685b1d34897f72af56343 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Tue, 24 Mar 2026 12:48:23 +0200
Subject: [PATCH 07/16] Lint errors and review comments

---
 1                                             |  25 ----
 csrc/multidevice/communicator.cpp             |  14 +-
 csrc/multidevice/communicator.h               |   2 +-
 csrc/multidevice/ipc_utils.h                  |   5 +-
 csrc/multidevice/symmetric_tensor.cpp         | 126 ++++++++++--------
 csrc/multidevice/symmetric_tensor.h           |   6 +-
 .../cpp/test_multidevice_symmetric_tensor.cpp |   3 +-
 7 files changed, 83 insertions(+), 98 deletions(-)
 delete mode 100644 1

diff --git a/1 b/1
deleted file mode 100644
index de4cad83241..00000000000
--- a/1
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-export CC=clang-20
-export CXX=clang++-20
-export CUDAHOSTCXX=/usr/bin/clang++-20
-export LDFLAGS="-fuse-ld=mold"
-
-export NVFUSER_BUILD_ENABLE_PCH
-
-export UCC_HOME="/opt/hpcx/ucc"
-export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc"
-export UCX_HOME="/opt/hpcx/ucx"
-export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx"
-
-# export TORCH_CUDA_ARCH_LIST="9.0"
-
-export NVFUSER_BUILD_WITH_UCC=1
-export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser
-export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY
-
-# Enable debug mode, leave empty for non-debug compilation
-export NVFUSER_BUILD_BUILD_TYPE=Debug
-export RUN_CMAKE=""
-
-pip install -v -e ./python --no-build-isolation
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index ba40835d9da..30bed276b00 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -417,7 +417,7 @@ c10d::Backend* Communicator::getBackendForTeam(
 #endif
 #if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
     std::optional<c10d::ProcessGroup::BackendType> pg_backend =
-    (b == CommunicatorBackend::kNccl)
+        (b == CommunicatorBackend::kNccl)
         ? std::optional<c10d::ProcessGroup::BackendType>(
               c10d::ProcessGroup::BackendType::NCCL)
         : std::nullopt;
@@ -459,12 +459,12 @@ void Communicator::barrier(std::optional<CommunicatorBackend> backend) {
 }
 
 std::string Communicator::getSymmMemGroupKey(
-  std::optional<CommunicatorBackend> backend) {
-std::vector<RankType> all_ranks(size_);
-std::iota(all_ranks.begin(), all_ranks.end(), 0);
-CommunicatorBackend b = backend.value_or(default_backend_);
-(void)getBackendForTeam(all_ranks, b);
-return getTeamKey(all_ranks, b);
+    std::optional<CommunicatorBackend> backend) {
+  std::vector<RankType> all_ranks(size_);
+  std::iota(all_ranks.begin(), all_ranks.end(), 0);
+  CommunicatorBackend b = backend.value_or(default_backend_);
+  (void)getBackendForTeam(all_ranks, b);
+  return getTeamKey(all_ranks, b);
 }
 
 } // namespace nvfuser
\ No newline at end of file
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 58ccc2b153a..e7efc8dbadf 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -123,7 +123,7 @@ class NVF_API Communicator {
 
   // Returns the world process-group name for the given backend.
   std::string getSymmMemGroupKey(
-    std::optional<CommunicatorBackend> backend = std::nullopt);
+      std::optional<CommunicatorBackend> backend = std::nullopt);
 
   // returns if a backend is available for creation
   bool isBackendAvailable(CommunicatorBackend backend) const {
diff --git a/csrc/multidevice/ipc_utils.h b/csrc/multidevice/ipc_utils.h
index 0cfd6586e47..9346ad69e6f 100644
--- a/csrc/multidevice/ipc_utils.h
+++ b/csrc/multidevice/ipc_utils.h
@@ -35,8 +35,9 @@ MulticastProtocol getMulticastProtocol();
 
 // Backend for symmetric memory allocation and rendezvous.
 // Native: Fuser's own CUDA VMM + IPC implementation (default, maintained).
-// PyTorch*: Use PyTorch's symmetric memory (torch.distributed._symmetric_memory)
-// with the given transport backend (Nccl, Nvshmem, or Cuda).
+// PyTorch*: Use PyTorch's symmetric memory
+// (torch.distributed._symmetric_memory) with the given transport backend (Nccl,
+// Nvshmem, or Cuda).
 enum class SymmetricMemoryBackend {
   Native,
   PyTorchNccl,
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index b53bfbe2235..acefdca779e 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -16,8 +16,8 @@
 #include "multidevice/utils.h"
 
 #ifdef NVFUSER_DISTRIBUTED
-#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 #include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
+#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 #endif
 
 namespace nvfuser {
@@ -25,57 +25,60 @@ namespace nvfuser {
 namespace {
 
 #ifdef NVFUSER_DISTRIBUTED
-  std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
-    static std::once_flag once;
-    std::call_once(once, [backend]() {
-      const char* name = nullptr;
-      switch (backend) {
-        case SymmetricMemoryBackend::PyTorchNccl:
-          name = "NCCL";
-          c10d::symmetric_memory::set_backend(name);
-          break;
-        case SymmetricMemoryBackend::PyTorchNvshmem:
-          name = "NVSHMEM";
-          c10d::symmetric_memory::set_backend(name);
-          break;
-        case SymmetricMemoryBackend::PyTorchCuda:
-          name = "CUDA";
-          break;
-        default:
-          NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend");
-      }
-    });
-
-    Communicator& comm = Communicator::getInstance();
-    NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory");
+std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
+  static std::once_flag once;
+  std::call_once(once, [backend]() {
+    const char* name = nullptr;
+    switch (backend) {
+      case SymmetricMemoryBackend::PyTorchNccl:
+        name = "NCCL";
+        c10d::symmetric_memory::set_backend(name);
+        break;
+      case SymmetricMemoryBackend::PyTorchNvshmem:
+        name = "NVSHMEM";
+        c10d::symmetric_memory::set_backend(name);
+        break;
+      case SymmetricMemoryBackend::PyTorchCuda:
+        name = "CUDA";
+        // set_backend(name) is not required for CUDA backend
+        break;
+      default:
+        NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend");
+    }
+  });
 
-    // Always return a valid group name
-    if (backend != SymmetricMemoryBackend::Native) {
-      NVF_CHECK(
-          comm.isBackendAvailable(CommunicatorBackend::kNccl),
-          "NCCL backend is required for symmetric_memory_backend(nccl)");
-
-      const std::string group_name = comm.getSymmMemGroupKey(CommunicatorBackend::kNccl);
-
-      static std::once_flag pg0_once;
-      std::call_once(pg0_once, [&]() {
-        try {
-          (void)c10d::resolve_process_group("0");
-        } catch (const c10::Error&) {
-          auto pg = c10d::resolve_process_group(group_name);
-          c10d::register_process_group("0", pg);
-        }
-      });
+  Communicator& comm = Communicator::getInstance();
+  NVF_CHECK(
+      comm.is_available(), "Communicator not available for symmetric memory");
 
-      comm.barrier(CommunicatorBackend::kNccl);
-      return group_name;
-    }
+  // Always return a valid group name
+  if (backend != SymmetricMemoryBackend::Native) {
+    NVF_CHECK(
+        comm.isBackendAvailable(CommunicatorBackend::kNccl),
+        "NCCL backend is required for non-native symmetric memory backend");
+
+    const std::string group_name =
+        comm.getSymmMemGroupKey(CommunicatorBackend::kNccl);
+
+    static std::once_flag pg0_once;
+    std::call_once(pg0_once, [&]() {
+      try {
+        (void)c10d::resolve_process_group("0");
+      } catch (const c10::Error&) {
+        auto pg = c10d::resolve_process_group(group_name);
+        c10d::register_process_group("0", pg);
+      }
+    });
 
-    NVF_ERROR(
-        false,
-        "No c10d backend available for symmetric memory rendezvous. "
-        "Expected NCCL or UCC process group.");
+    comm.barrier(CommunicatorBackend::kNccl);
+    return group_name;
   }
+
+  NVF_ERROR(
+      false,
+      "No c10d backend available for symmetric memory rendezvous. "
+      "Expected NCCL or UCC process group.");
+}
 #endif
 
 
@@ -160,7 +163,8 @@ at::Tensor SymmetricTensor::allocate(
     }
     // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name.
     c10::optional<std::string> alloc_group_name =
-        (backend == SymmetricMemoryBackend::PyTorchNccl || backend == SymmetricMemoryBackend::PyTorchNvshmem)
+        (backend == SymmetricMemoryBackend::PyTorchNccl ||
+          backend == SymmetricMemoryBackend::PyTorchNvshmem)
         ? c10::nullopt
         : c10::optional<std::string>(group_name);
     return c10d::symmetric_memory::empty_strided_p2p(
@@ -176,7 +180,8 @@ at::Tensor SymmetricTensor::allocate(
     NVF_ERROR(
         false,
         "PyTorch symmetric memory backend requires a build with "
-        "NVFUSER_DISTRIBUTED. Use NVFUSER_ENABLE=symmetric_memory_backend(native) "
+        "NVFUSER_DISTRIBUTED. Use "
+        "NVFUSER_ENABLE=symmetric_memory_backend(native) "
         "or do not set symmetric_memory_backend.");
   }
 #endif
@@ -415,12 +420,13 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
     return;
   }
 #ifdef NVFUSER_DISTRIBUTED
-  // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles).
+  // PyTorch backend: perform rendezvous here (lazy, on first
+  // setupRemoteHandles).
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
   if (backend != SymmetricMemoryBackend::Native) {
     const std::string group_name = ensurePyTorchSymmMemBackend(backend);
-    torch_symm_handle_ = c10d::symmetric_memory::rendezvous(
-        local_tensor_, group_name);
+    torch_symm_handle_ =
+        c10d::symmetric_memory::rendezvous(local_tensor_, group_name);
     are_remote_tensors_setup_ = true;
     if (torch_symm_handle_->has_multicast_support()) {
       is_multicast_setup_ = true;
@@ -509,7 +515,7 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
 #ifdef NVFUSER_DISTRIBUTED
   if (torch_symm_handle_) {
     return torch_symm_handle_->get_remote_tensor(
-      rank, local_tensor_.sizes(), local_tensor_.scalar_type());
+        rank, local_tensor_.sizes(), local_tensor_.scalar_type());
   }
 #endif
 
@@ -535,8 +541,8 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
 #ifdef NVFUSER_DISTRIBUTED
   if (torch_symm_handle_) {
     NVF_THROW(
-        false,
-        "Contiguous view is not yet supported for PyTorch symmetric memory backend. "
+        "Contiguous view is not yet supported for PyTorch symmetric memory "
+        "backend."
         "Use native backend for SymmetricContiguousView.");
   }
 #endif
@@ -606,8 +612,8 @@ at::Tensor SymmetricTensor::getContiguousView() const {
 #ifdef NVFUSER_DISTRIBUTED
   if (torch_symm_handle_) {
     NVF_THROW(
-        false,
-        "Contiguous view is not yet supported for PyTorch symmetric memory backend.");
+        "Contiguous view is not yet supported for PyTorch symmetric memory "
+        "backend.");
   }
 #endif
   NVF_CHECK(is_contiguous_view_setup_, "Contiguous view not setup");
@@ -625,7 +631,9 @@ void SymmetricTensor::setupMulticast(
   if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
     if (!torch_symm_handle_) {
       setupRemoteHandles(tag);
-      NVF_CHECK(torch_symm_handle_->has_multicast_support(), "Multicast not supported");
+      NVF_CHECK(
+          torch_symm_handle_->has_multicast_support(),
+          "Multicast not supported");
     }
     return;
   }
diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h
index 20f4d63ccda..02e47d8a7fc 100644
--- a/csrc/multidevice/symmetric_tensor.h
+++ b/csrc/multidevice/symmetric_tensor.h
@@ -28,7 +28,8 @@ namespace nvfuser {
 // - Native (default): Fuser's own CUDA VMM + IPC implementation
 // - PyTorch (Nccl, Nvshmem, Cuda): Use PyTorch's symmetric memory
 //   (torch.distributed._symmetric_memory) with the chosen transport backend.
-//   Select via NVFUSER_ENABLE=symmetric_memory_backend(native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda).
+//   Select via
+//   NVFUSER_ENABLE=symmetric_memory_backend(native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda).
 //   Native remains the default when the option is not set.
 class SymmetricTensor {
  public:
@@ -85,7 +86,8 @@ class SymmetricTensor {
   bool is_contiguous_view_setup_ = false;
   at::Tensor contiguous_view_;
 #ifdef NVFUSER_DISTRIBUTED
-  c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory> torch_symm_handle_;
+  c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory>
+      torch_symm_handle_;
 #endif
 };
 
diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp
index 020ea0b8745..e39a41861c2 100644
--- a/tests/cpp/test_multidevice_symmetric_tensor.cpp
+++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp
@@ -5,9 +5,9 @@
  * SPDX-License-Identifier: BSD-3-Clause
  */
 // clang-format on
+#include "multidevice/ipc_utils.h"
 #include "multidevice/symmetric_tensor.h"
 #include "tests/cpp/multidevice.h"
-#include "multidevice/ipc_utils.h"
 
 namespace nvfuser {
 
@@ -236,7 +236,6 @@ TEST_F(SymmetricTensorTest, SmallAllocation) {
   if (communicator_->size() == 1) {
     GTEST_SKIP() << "Skipping test for single device";
   }
-  std::cout << "Vishal chishta" << std::endl;
 
   const int64_t rank = communicator_->deviceId();
   const int64_t world_size = communicator_->size();

From 1be0134e8a6349bf227cb525b9c858ab90c8c1fa Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Tue, 24 Mar 2026 13:26:41 +0200
Subject: [PATCH 08/16] fix 3 lint errors

---
 csrc/multidevice/communicator.cpp     | 2 +-
 csrc/multidevice/symmetric_tensor.cpp | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 30bed276b00..88cad27d012 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -467,4 +467,4 @@ std::string Communicator::getSymmMemGroupKey(
   return getTeamKey(all_ranks, b);
 }
 
-} // namespace nvfuser
\ No newline at end of file
+} // namespace nvfuser
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index acefdca779e..2fc52284b92 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -64,7 +64,9 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
     std::call_once(pg0_once, [&]() {
       try {
         (void)c10d::resolve_process_group("0");
-      } catch (const c10::Error&) {
+      } catch (const std::exception&) {
+        // resolve_process_group throws c10::Error
+        // (derives from std::exception)
         auto pg = c10d::resolve_process_group(group_name);
         c10d::register_process_group("0", pg);
       }
@@ -164,7 +166,7 @@ at::Tensor SymmetricTensor::allocate(
     // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name.
     c10::optional<std::string> alloc_group_name =
         (backend == SymmetricMemoryBackend::PyTorchNccl ||
-          backend == SymmetricMemoryBackend::PyTorchNvshmem)
+        backend == SymmetricMemoryBackend::PyTorchNvshmem)
         ? c10::nullopt
         : c10::optional<std::string>(group_name);
     return c10d::symmetric_memory::empty_strided_p2p(

From 3596301f42b3245f9faefdd49eed0997bddd319d Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Wed, 25 Mar 2026 10:12:59 +0200
Subject: [PATCH 09/16] Fix clang-tidy errors

---
 csrc/multidevice/communicator.cpp     | 19 +++----
 csrc/multidevice/communicator.h       | 12 -----
 csrc/multidevice/ipc_utils.cpp        | 28 +++++-----
 csrc/multidevice/ipc_utils.h          |  4 +-
 csrc/multidevice/symmetric_tensor.cpp | 77 +++++++++++++++------------
 5 files changed, 68 insertions(+), 72 deletions(-)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 88cad27d012..b818bbb807c 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -14,13 +14,9 @@
 #include <numeric>
 
 #ifdef NVFUSER_DISTRIBUTED
-#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
 #include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
-#endif
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
-#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
 #include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
-#endif
 #include <torch/csrc/distributed/c10d/exception.h>
 #ifdef USE_C10D_NCCL
 #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
@@ -127,7 +123,8 @@ bool parseEnv(
   }
 
   // retrieves master port
-  if ((env = std::getenv("NVFUSER_MASTER_PORT")) != nullptr) {
+  env = std::getenv("NVFUSER_MASTER_PORT");
+  if (env != nullptr) {
     master_port = std::atoi(env);
   } else {
     LOG(INFO) << "The environment variable NVFUSER_MASTER_PORT has not been "
@@ -254,10 +251,10 @@ void waitForDebuggerAtRanks(
     std::cerr << "Process " << pid
               << " is waiting for the debugger. To continue debugging, "
               << "start gdb, `attach " << pid
-              << "`, `set var waiting=false`, and `fini`." << std::endl;
+              << "`, `set var waiting=false`, and `fini`.\n";
     while (waiting) { // Please change `waiting` in the debugger.
     }
-    std::cerr << "Process " << getpid() << " finished waiting." << std::endl;
+    std::cerr << "Process " << getpid() << " finished waiting.\n";
   }
 
   if (communicator->is_available()) {
@@ -369,12 +366,10 @@ void Communicator::cleanup() {
     }
   }
 #endif
-#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
   for (const auto& entry : process_groups_) {
     c10d::unregister_process_group(entry.first);
   }
   process_groups_.clear();
-#endif
   backends_.clear();
 }
 
@@ -400,7 +395,7 @@ c10d::Backend* Communicator::getBackendForTeam(
 #ifdef NVFUSER_DISTRIBUTED
     backends_[team_key] = [&]() -> c10::intrusive_ptr<c10d::Backend> {
       // check that the caller's rank belongs to the requested team
-      auto rank_it = std::find(team.begin(), team.end(), deviceId());
+      auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
       if (rank_it == team.end()) {
         return nullptr;
       }
@@ -415,14 +410,13 @@ c10d::Backend* Communicator::getBackendForTeam(
 #else
     backends_[team_key] = nullptr;
 #endif
-#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
     std::optional<c10d::ProcessGroup::BackendType> pg_backend =
         (b == CommunicatorBackend::kNccl)
         ? std::optional<c10d::ProcessGroup::BackendType>(
               c10d::ProcessGroup::BackendType::NCCL)
         : std::nullopt;
     if (backends_[team_key] != nullptr && pg_backend.has_value()) {
-      auto rank_it = std::find(team.begin(), team.end(), deviceId());
+      auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
       RankType team_rank = std::distance(team.begin(), rank_it);
 
       auto pg = c10::make_intrusive<c10d::ProcessGroup>(
@@ -436,7 +430,6 @@ c10d::Backend* Communicator::getBackendForTeam(
       c10d::register_process_group(team_key, pg);
       process_groups_[team_key] = std::move(pg);
     }
-#endif
   }
   return backends_.at(team_key).get();
 }
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index e7efc8dbadf..0b3733505bf 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -11,19 +11,9 @@
 #include <ATen/core/ivalue.h>
 #include <c10/util/intrusive_ptr.h>
 
-#if defined(NVFUSER_DISTRIBUTED) && \
-    __has_include(<torch/csrc/distributed/c10d/GroupRegistry.hpp>) && \
-    __has_include(<torch/csrc/distributed/c10d/ProcessGroup.hpp>)
-#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 1
-#else
-#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 0
-#endif
-
 #ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/Backend.hpp>
-#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
 #include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
-#endif
 #include <torch/csrc/distributed/c10d/TCPStore.hpp>
 #include <torch/csrc/distributed/c10d/Work.hpp>
 #else
@@ -168,11 +158,9 @@ class NVF_API Communicator {
   c10::intrusive_ptr<c10d::TCPStore> store_;
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
-#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP
   // c10d process-group wrappers registered for symmetric-memory rendezvous.
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::ProcessGroup>>
       process_groups_;
-#endif
 };
 
 } // namespace nvfuser
diff --git a/csrc/multidevice/ipc_utils.cpp b/csrc/multidevice/ipc_utils.cpp
index 01bdf949044..ab1ad413c87 100644
--- a/csrc/multidevice/ipc_utils.cpp
+++ b/csrc/multidevice/ipc_utils.cpp
@@ -38,7 +38,7 @@ int createIpcSocket(const std::string& path) {
   int sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
   NVF_CHECK(sockfd >= 0, "Failed to create socket: ", strerror(errno));
 
-  struct sockaddr_un addr;
+  struct sockaddr_un addr {};
   setupSockAddr(addr, path);
 
   // For abstract namespace, len is usually calculated specifically, but for
@@ -69,7 +69,7 @@ void sendFd(
   int sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
   NVF_CHECK(sockfd >= 0, "Failed to create socket: ", strerror(errno));
 
-  struct sockaddr_un addr;
+  struct sockaddr_un addr {};
   setupSockAddr(addr, path);
   socklen_t addrlen = sizeof(addr.sun_family) + path.length();
 
@@ -77,8 +77,9 @@ void sendFd(
   int ret = -1;
   for (int i = 0; i < 100; ++i) {
     ret = connect(sockfd, (struct sockaddr*)&addr, addrlen);
-    if (ret == 0)
+    if (ret == 0) {
       break;
+    }
     usleep(10000); // 10ms
   }
   if (ret < 0) {
@@ -86,14 +87,16 @@ void sendFd(
     NVF_CHECK(false, "Failed to connect to ", path, ": ", strerror(errno));
   }
 
-  struct msghdr msg = {0};
-  struct cmsghdr* cmsg;
+  struct msghdr msg {};
+  struct cmsghdr* cmsg = nullptr;
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays)
   char buf[CMSG_SPACE(sizeof(int))];
 
   // If no header data, send at least one byte
   char dummy = '.';
-  struct iovec iov;
+  struct iovec iov {};
   if (header_data && header_len > 0) {
+    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
     iov.iov_base = const_cast<void*>(header_data);
     iov.iov_len = header_len;
   } else {
@@ -121,21 +124,22 @@ void sendFd(
 }
 
 int recvFd(int socket_fd, void* header_data, size_t header_len) {
-  struct sockaddr_un client_addr;
+  struct sockaddr_un client_addr {};
   socklen_t client_len = sizeof(client_addr);
   int client_fd =
       accept(socket_fd, (struct sockaddr*)&client_addr, &client_len);
   NVF_CHECK(client_fd >= 0, "Failed to accept connection: ", strerror(errno));
 
-  struct msghdr msg = {0};
-  struct cmsghdr* cmsg;
+  struct msghdr msg {};
+  struct cmsghdr* cmsg = nullptr;
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays)
   char buf[CMSG_SPACE(sizeof(int))];
 
   // If header_len > 0, we expect that much data.
   // Note: recvmsg might return fewer bytes if strict requirements aren't met,
   // but for local unix sockets with small payloads, it usually delivers all.
-  char dummy;
-  struct iovec iov;
+  char dummy = '.';
+  struct iovec iov {};
   if (header_data && header_len > 0) {
     iov.iov_base = header_data;
     iov.iov_len = header_len;
@@ -168,7 +172,7 @@ int recvFd(int socket_fd, void* header_data, size_t header_len) {
 
   int recv_fd = -1;
   cmsg = CMSG_FIRSTHDR(&msg);
-  if (cmsg != NULL && cmsg->cmsg_len == CMSG_LEN(sizeof(int))) {
+  if (cmsg != nullptr && cmsg->cmsg_len == CMSG_LEN(sizeof(int))) {
     if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) {
       memcpy(&recv_fd, CMSG_DATA(cmsg), sizeof(int));
     }
diff --git a/csrc/multidevice/ipc_utils.h b/csrc/multidevice/ipc_utils.h
index 9346ad69e6f..f0b69fcdbf1 100644
--- a/csrc/multidevice/ipc_utils.h
+++ b/csrc/multidevice/ipc_utils.h
@@ -29,7 +29,7 @@ const T& fromBytes(const std::vector<uint8_t>& bytes) {
 
 // IPC Utils for sharing file descriptors
 
-enum class MulticastProtocol { Memcpy, Multimem, BatchMemcpy };
+enum class MulticastProtocol : uint8_t { Memcpy, Multimem, BatchMemcpy };
 
 MulticastProtocol getMulticastProtocol();
 
@@ -38,7 +38,7 @@ MulticastProtocol getMulticastProtocol();
 // PyTorch*: Use PyTorch's symmetric memory
 // (torch.distributed._symmetric_memory) with the given transport backend (Nccl,
 // Nvshmem, or Cuda).
-enum class SymmetricMemoryBackend {
+enum class SymmetricMemoryBackend : uint8_t {
   Native,
   PyTorchNccl,
   PyTorchNvshmem,
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index 2fc52284b92..4af6f917673 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -65,7 +65,7 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
       try {
         (void)c10d::resolve_process_group("0");
       } catch (const std::exception&) {
-        // resolve_process_group throws c10::Error
+        // resolve_process_group throws c10d Error
         // (derives from std::exception)
         auto pg = c10d::resolve_process_group(group_name);
         c10d::register_process_group("0", pg);
@@ -83,7 +83,6 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
 }
 #endif
 
-
 // Returns the allocation granularity for symmetric memory.
 // - query_mcast_granularity: if true, considers multicast granularity
 // - query_mcast_recommended_granularity: if true, uses recommended (larger)
@@ -99,7 +98,7 @@ int64_t getGranularityForSymmetricMemory(
 
 #if (CUDA_VERSION >= NVF_MIN_CUDA_FOR_MCAST)
   if (!query_mcast_granularity) {
-    return alloc_granularity;
+    return static_cast<int64_t>(alloc_granularity);
   }
 
   // Check if device supports multicast before querying multicast granularity
@@ -110,7 +109,7 @@ int64_t getGranularityForSymmetricMemory(
       prop.location.id));
 
   if (is_multicast_supported == 0) {
-    return alloc_granularity;
+    return static_cast<int64_t>(alloc_granularity);
   }
 
   // Device supports multicast, query multicast granularity
@@ -138,7 +137,7 @@ int64_t getGranularityForSymmetricMemory(
     granularity = mcast_rec_granularity;
   }
 
-  return std::max(alloc_granularity, granularity);
+  return static_cast<int64_t>(std::max(alloc_granularity, granularity));
 #else
   (void)requested_size_bytes;
   (void)query_mcast_granularity;
@@ -166,7 +165,7 @@ at::Tensor SymmetricTensor::allocate(
     // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name.
     c10::optional<std::string> alloc_group_name =
         (backend == SymmetricMemoryBackend::PyTorchNccl ||
-        backend == SymmetricMemoryBackend::PyTorchNvshmem)
+         backend == SymmetricMemoryBackend::PyTorchNvshmem)
         ? c10::nullopt
         : c10::optional<std::string>(group_name);
     return c10d::symmetric_memory::empty_strided_p2p(
@@ -188,25 +187,26 @@ at::Tensor SymmetricTensor::allocate(
   }
 #endif
 
-  int is_vmm_supported;
+  int is_vmm_supported = 0;
   NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute(
       &is_vmm_supported,
       CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
       device.index()));
   NVF_ERROR(is_vmm_supported, "Device does not support VMM");
 
-  const int64_t numel = std::accumulate(
-      sizes.begin(), sizes.end(), 1, std::multiplies<int64_t>());
-  const int64_t element_size = c10::elementSize(dtype);
+  const int64_t numel = static_cast<int64_t>(
+      std::accumulate(sizes.begin(), sizes.end(), 1, std::multiplies<>()));
+  const int64_t element_size = static_cast<int64_t>(c10::elementSize(dtype));
   const int64_t alloc_size = numel * element_size;
 
   CUmemAllocationProp prop{};
   prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
   prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  prop.location.id = static_cast<int>(device.index());
+  prop.location.id =
+      static_cast<int>(static_cast<unsigned char>(device.index()));
   prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
 
-  size_t granularity =
+  int64_t granularity =
       getGranularityForSymmetricMemory(prop, static_cast<size_t>(alloc_size));
   int64_t rounded_size =
       ((alloc_size + granularity - 1) / granularity) * granularity;
@@ -221,20 +221,22 @@ at::Tensor SymmetricTensor::allocate(
 
   CUmemAccessDesc access{};
   access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  access.location.id = static_cast<int>(device.index());
+  access.location.id =
+      static_cast<int>(static_cast<unsigned char>(device.index()));
   access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   NVFUSER_CUDA_SAFE_CALL(cuMemSetAccess(ptr, rounded_size, &access, 1));
 
   std::vector<int64_t> strides(sizes.size());
   strides.back() = 1;
-  for (int64_t i = strides.size() - 2; i >= 0; --i) {
+  for (size_t i = strides.size() - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * sizes[i + 1];
   }
 
   return at::from_blob(
-      (void*)ptr,
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
+      reinterpret_cast<void*>(ptr),
       sizes,
-      std::move(strides),
+      strides,
       [=](void* ptr) {
         cuMemUnmap((CUdeviceptr)(ptr), rounded_size);
         cuMemAddressFree((CUdeviceptr)(ptr), rounded_size);
@@ -247,7 +249,7 @@ std::string SymmetricTensor::validate(at::Tensor tensor) {
   if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
     return "";
   }
-  int is_vmm_supported;
+  int is_vmm_supported = 0;
   NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute(
       &is_vmm_supported,
       CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
@@ -260,7 +262,7 @@ std::string SymmetricTensor::validate(at::Tensor tensor) {
 
   CUmemLocation location{};
   location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-  location.id = Communicator::getInstance().local_rank();
+  location.id = static_cast<int>(Communicator::getInstance().local_rank());
   unsigned long long flags = 0;
   NVFUSER_CUDA_SAFE_CALL(cuMemGetAccess(&flags, &location, ptr));
   if (flags != CU_MEM_ACCESS_FLAGS_PROT_READWRITE) {
@@ -269,7 +271,7 @@ std::string SymmetricTensor::validate(at::Tensor tensor) {
 
   CUmemGenericAllocationHandle alloc_handle = 0;
   NVFUSER_CUDA_SAFE_CALL(
-      cuMemRetainAllocationHandle(&alloc_handle, (void*)ptr));
+      cuMemRetainAllocationHandle(&alloc_handle, tensor.data_ptr()));
 
   CUmemAllocationProp prop{};
   NVFUSER_CUDA_SAFE_CALL(
@@ -358,9 +360,9 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
   alloc_handles_.resize(world_size_);
   remote_ptrs_.resize(world_size_);
 
-  CUmemGenericAllocationHandle local_handle;
-  NVFUSER_CUDA_SAFE_CALL(cuMemRetainAllocationHandle(
-      &local_handle, reinterpret_cast<void*>(local_ptr)));
+  CUmemGenericAllocationHandle local_handle = 0;
+  NVFUSER_CUDA_SAFE_CALL(
+      cuMemRetainAllocationHandle(&local_handle, local_tensor_.data_ptr()));
 
   alloc_handles_[my_device_id_] = local_handle;
   remote_ptrs_[my_device_id_] = local_ptr;
@@ -386,8 +388,9 @@ SymmetricTensor::~SymmetricTensor() {
       // for now. cuMulticastUnbind(mcast_handle_, cu_dev_, 0, aligned_size_);
       cuMemRelease(mcast_handle_);
     }
-    if (peer_fd_ >= 0)
+    if (peer_fd_ >= 0) {
       close(peer_fd_);
+    }
   }
 #endif
 
@@ -446,7 +449,7 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
   NVFUSER_CUDA_SAFE_CALL(cuMemGetAddressRange(&base_ptr, &va_size, local_ptr));
   size_t offset = local_ptr - base_ptr;
 
-  int shared_fd;
+  int shared_fd = -1;
   NVFUSER_CUDA_SAFE_CALL(cuMemExportToShareableHandle(
       &shared_fd,
       local_handle,
@@ -475,9 +478,10 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
     NVF_CHECK(
         sender_rank >= 0 && sender_rank < world_size_, "Invalid sender rank");
 
-    CUmemGenericAllocationHandle peer_handle;
+    CUmemGenericAllocationHandle peer_handle = 0;
     NVFUSER_CUDA_SAFE_CALL(cuMemImportFromShareableHandle(
         &peer_handle,
+        // NOLINTNEXTLINE(performance-no-int-to-ptr)
         reinterpret_cast<void*>(static_cast<uint64_t>(local_fd)),
         CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
 
@@ -517,12 +521,15 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
 #ifdef NVFUSER_DISTRIBUTED
   if (torch_symm_handle_) {
     return torch_symm_handle_->get_remote_tensor(
-        rank, local_tensor_.sizes(), local_tensor_.scalar_type());
+        static_cast<int>(rank),
+        local_tensor_.sizes(),
+        local_tensor_.scalar_type());
   }
 #endif
 
   NVF_CHECK(are_remote_tensors_setup_ == true, "Remote tensors not setup");
   return at::from_blob(
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       reinterpret_cast<void*>(remote_ptrs_[rank]),
       local_tensor_.sizes(),
       local_tensor_.strides(),
@@ -557,7 +564,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
   const int64_t world_size = comm.size();
 
   size_t total_size = aligned_size_ * world_size;
-  CUdeviceptr base;
+  CUdeviceptr base = 0;
   NVFUSER_CUDA_SAFE_CALL(
       cuMemAddressReserve(&base, total_size, granularity_, 0, 0));
 
@@ -583,13 +590,15 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
   NVF_CHECK(
       aligned_size_ % local_tensor_.element_size() == 0,
       "Aligned size must be divisible by element size");
-  strides.push_back(aligned_size_ / local_tensor_.element_size());
+  strides.push_back(
+      static_cast<int64_t>(aligned_size_ / local_tensor_.element_size()));
   for (int64_t s : local_tensor_.strides()) {
     strides.push_back(s);
   }
 
   size_t map_size = aligned_size_;
   contiguous_view_ = at::from_blob(
+      // NOLINTNEXTLINE(performance-no-int-to-ptr)
       reinterpret_cast<void*>(base),
       sizes,
       strides,
@@ -644,14 +653,14 @@ void SymmetricTensor::setupMulticast(
   const int64_t my_rank = comm.deviceId();
   const int64_t local_rank = comm.local_rank();
 
-  int is_multicast_supported;
+  int is_multicast_supported = 0;
   NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute(
       &is_multicast_supported,
       CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED,
       local_rank));
   NVF_CHECK(is_multicast_supported, "Multicast not supported");
 
-  exporter_rank_ = exporter_rank;
+  exporter_rank_ = static_cast<int>(exporter_rank);
 
   CUmulticastObjectProp mcast_prop{};
   mcast_prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
@@ -682,6 +691,7 @@ void SymmetricTensor::setupMulticast(
 
     NVFUSER_CUDA_SAFE_CALL(cuMemImportFromShareableHandle(
         &mcast_handle_,
+        // NOLINTNEXTLINE(performance-no-int-to-ptr)
         reinterpret_cast<void*>(static_cast<uint64_t>(peer_fd_)),
         CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
   } else {
@@ -700,8 +710,8 @@ void SymmetricTensor::setupMulticast(
   NVFUSER_CUDA_SAFE_CALL(cuMulticastAddDevice(mcast_handle_, cu_dev_));
 
   CUdeviceptr local_ptr = remote_ptrs_[my_device_id_];
-  CUdeviceptr base_ptr;
-  size_t base_size;
+  CUdeviceptr base_ptr = 0;
+  size_t base_size = 0;
   NVFUSER_CUDA_SAFE_CALL(
       cuMemGetAddressRange(&base_ptr, &base_size, local_ptr));
   size_t mem_offset = static_cast<size_t>(local_ptr - base_ptr);
@@ -721,7 +731,7 @@ void SymmetricTensor::setupMulticast(
       aligned_size_,
       0));
 
-  CUdeviceptr mc_ptr;
+  CUdeviceptr mc_ptr = 0;
   NVFUSER_CUDA_SAFE_CALL(
       cuMemAddressReserve(&mc_ptr, aligned_size_, granularity_, 0, 0));
   NVFUSER_CUDA_SAFE_CALL(cuMemMap(mc_ptr, aligned_size_, 0, mcast_handle_, 0));
@@ -732,6 +742,7 @@ void SymmetricTensor::setupMulticast(
   access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
   NVFUSER_CUDA_SAFE_CALL(cuMemSetAccess(mc_ptr, aligned_size_, &access, 1));
 
+  // NOLINTNEXTLINE(performance-no-int-to-ptr)
   mc_ptr_ = reinterpret_cast<void*>(mc_ptr + offset_diff);
   mc_base_ptr_ = mc_ptr;
   is_multicast_setup_ = true;

From 9b059152218c2b2f2e13431f08e9ff8ca3ef48f9 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Wed, 25 Mar 2026 10:38:01 +0200
Subject: [PATCH 10/16] Fixing outdated lint errors

---
 csrc/multidevice/communicator.cpp     |  6 ++++--
 csrc/multidevice/communicator.h       |  2 ++
 csrc/multidevice/ipc_utils.cpp        | 14 +++++++-------
 csrc/multidevice/symmetric_tensor.cpp |  2 +-
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index b818bbb807c..5b956f91245 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -352,12 +352,13 @@ void Communicator::cleanup() {
 
   store_ = nullptr;
 
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL)
+#if defined(NVFUSER_DISTRIBUTED)
+#if defined(USE_C10D_NCCL)
   // Sort backends to work around a NCCL bug (nvbugs/4889623). Closing backends
   // in different orders between ranks have been causing a hang.
   std::vector<std::pair<std::string, c10::intrusive_ptr<c10d::Backend>>>
       keyed_backends(backends_.begin(), backends_.end());
-  std::sort(keyed_backends.begin(), keyed_backends.end());
+  std::ranges::sort(keyed_backends.begin(), keyed_backends.end());
   for (auto& [key, backend] : keyed_backends) {
     // Call shutdown before destructing a ProcessGroupNCCL as instructed by
     // https://github.com/pytorch/pytorch/blob/e62073d7997c9e63896cb5289ffd0874a8cc1838/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L1164-L1170.
@@ -370,6 +371,7 @@ void Communicator::cleanup() {
     c10d::unregister_process_group(entry.first);
   }
   process_groups_.clear();
+#endif
   backends_.clear();
 }
 
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 0b3733505bf..b2ff5c89150 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -159,8 +159,10 @@ class NVF_API Communicator {
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
   // c10d process-group wrappers registered for symmetric-memory rendezvous.
+#ifdef NVFUSER_DISTRIBUTED
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::ProcessGroup>>
       process_groups_;
+#endif
 };
 
 } // namespace nvfuser
diff --git a/csrc/multidevice/ipc_utils.cpp b/csrc/multidevice/ipc_utils.cpp
index ab1ad413c87..ae5461e4e70 100644
--- a/csrc/multidevice/ipc_utils.cpp
+++ b/csrc/multidevice/ipc_utils.cpp
@@ -38,7 +38,7 @@ int createIpcSocket(const std::string& path) {
   int sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
   NVF_CHECK(sockfd >= 0, "Failed to create socket: ", strerror(errno));
 
-  struct sockaddr_un addr {};
+  struct sockaddr_un addr{};
   setupSockAddr(addr, path);
 
   // For abstract namespace, len is usually calculated specifically, but for
@@ -69,7 +69,7 @@ void sendFd(
   int sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
   NVF_CHECK(sockfd >= 0, "Failed to create socket: ", strerror(errno));
 
-  struct sockaddr_un addr {};
+  struct sockaddr_un addr{};
   setupSockAddr(addr, path);
   socklen_t addrlen = sizeof(addr.sun_family) + path.length();
 
@@ -87,14 +87,14 @@ void sendFd(
     NVF_CHECK(false, "Failed to connect to ", path, ": ", strerror(errno));
   }
 
-  struct msghdr msg {};
+  struct msghdr msg{};
   struct cmsghdr* cmsg = nullptr;
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays)
   char buf[CMSG_SPACE(sizeof(int))];
 
   // If no header data, send at least one byte
   char dummy = '.';
-  struct iovec iov {};
+  struct iovec iov{};
   if (header_data && header_len > 0) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
     iov.iov_base = const_cast<void*>(header_data);
@@ -124,13 +124,13 @@ void sendFd(
 }
 
 int recvFd(int socket_fd, void* header_data, size_t header_len) {
-  struct sockaddr_un client_addr {};
+  struct sockaddr_un client_addr{};
   socklen_t client_len = sizeof(client_addr);
   int client_fd =
       accept(socket_fd, (struct sockaddr*)&client_addr, &client_len);
   NVF_CHECK(client_fd >= 0, "Failed to accept connection: ", strerror(errno));
 
-  struct msghdr msg {};
+  struct msghdr msg{};
   struct cmsghdr* cmsg = nullptr;
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays)
   char buf[CMSG_SPACE(sizeof(int))];
@@ -139,7 +139,7 @@ int recvFd(int socket_fd, void* header_data, size_t header_len) {
   // Note: recvmsg might return fewer bytes if strict requirements aren't met,
   // but for local unix sockets with small payloads, it usually delivers all.
   char dummy = '.';
-  struct iovec iov {};
+  struct iovec iov{};
   if (header_data && header_len > 0) {
     iov.iov_base = header_data;
     iov.iov_len = header_len;
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index 4af6f917673..d7778122d03 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -228,7 +228,7 @@ at::Tensor SymmetricTensor::allocate(
 
   std::vector<int64_t> strides(sizes.size());
   strides.back() = 1;
-  for (size_t i = strides.size() - 2; i >= 0; --i) {
+  for (int64_t i = static_cast<int64_t>(strides.size()) - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * sizes[i + 1];
   }
 

From 6147139a29396f129737d3bd5e0aa886df48690c Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Wed, 25 Mar 2026 18:43:09 +0200
Subject: [PATCH 11/16] Add torch distributed gaurd

---
 csrc/multidevice/communicator.cpp     |  2 ++
 csrc/multidevice/communicator.h       |  4 ++--
 csrc/multidevice/symmetric_tensor.cpp | 22 +++++++++++-----------
 csrc/multidevice/symmetric_tensor.h   |  4 ++--
 4 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index 5b956f91245..c2bd92c4c26 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -412,6 +412,7 @@ c10d::Backend* Communicator::getBackendForTeam(
 #else
     backends_[team_key] = nullptr;
 #endif
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
     std::optional<c10d::ProcessGroup::BackendType> pg_backend =
         (b == CommunicatorBackend::kNccl)
         ? std::optional<c10d::ProcessGroup::BackendType>(
@@ -432,6 +433,7 @@ c10d::Backend* Communicator::getBackendForTeam(
       c10d::register_process_group(team_key, pg);
       process_groups_[team_key] = std::move(pg);
     }
+#endif
   }
   return backends_.at(team_key).get();
 }
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index b2ff5c89150..85406c4d906 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -11,7 +11,7 @@
 #include <ATen/core/ivalue.h>
 #include <c10/util/intrusive_ptr.h>
 
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
 #include <torch/csrc/distributed/c10d/Backend.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
 #include <torch/csrc/distributed/c10d/TCPStore.hpp>
@@ -159,7 +159,7 @@ class NVF_API Communicator {
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
   // c10d process-group wrappers registered for symmetric-memory rendezvous.
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::ProcessGroup>>
       process_groups_;
 #endif
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index d7778122d03..c40dc48e87b 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -15,7 +15,7 @@
 #include "multidevice/ipc_utils.h"
 #include "multidevice/utils.h"
 
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
 #include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 #endif
@@ -24,7 +24,7 @@ namespace nvfuser {
 
 namespace {
 
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
 std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
   static std::once_flag once;
   std::call_once(once, [backend]() {
@@ -154,7 +154,7 @@ at::Tensor SymmetricTensor::allocate(
     at::Device device) {
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
 
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (backend != SymmetricMemoryBackend::Native) {
     const std::string group_name = ensurePyTorchSymmMemBackend(backend);
     std::vector<int64_t> strides(sizes.size());
@@ -181,7 +181,7 @@ at::Tensor SymmetricTensor::allocate(
     NVF_ERROR(
         false,
         "PyTorch symmetric memory backend requires a build with "
-        "NVFUSER_DISTRIBUTED. Use "
+        "NVFUSER_DISTRIBUTED and USE_DISTRIBUTED. Use "
         "NVFUSER_ENABLE=symmetric_memory_backend(native) "
         "or do not set symmetric_memory_backend.");
   }
@@ -317,7 +317,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
       "Expected CUDA tensor, got: ",
       local_tensor.device());
 
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
   if (backend != SymmetricMemoryBackend::Native) {
     Communicator& comm = Communicator::getInstance();
@@ -369,7 +369,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
 }
 
 SymmetricTensor::~SymmetricTensor() {
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     return; // PyTorch backend: no native VMM cleanup
   }
@@ -424,7 +424,7 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
   if (are_remote_tensors_setup_ == true) {
     return;
   }
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   // PyTorch backend: perform rendezvous here (lazy, on first
   // setupRemoteHandles).
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
@@ -518,7 +518,7 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
     return local_tensor_;
   }
 
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     return torch_symm_handle_->get_remote_tensor(
         static_cast<int>(rank),
@@ -547,7 +547,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
   if (is_contiguous_view_setup_) {
     return;
   }
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     NVF_THROW(
         "Contiguous view is not yet supported for PyTorch symmetric memory "
@@ -620,7 +620,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
 }
 
 at::Tensor SymmetricTensor::getContiguousView() const {
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     NVF_THROW(
         "Contiguous view is not yet supported for PyTorch symmetric memory "
@@ -638,7 +638,7 @@ void SymmetricTensor::setupMulticast(
   if (is_multicast_setup_) {
     return;
   }
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
     if (!torch_symm_handle_) {
       setupRemoteHandles(tag);
diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h
index 02e47d8a7fc..a2ab9e2d2ef 100644
--- a/csrc/multidevice/symmetric_tensor.h
+++ b/csrc/multidevice/symmetric_tensor.h
@@ -10,7 +10,7 @@
 #include <ATen/core/Tensor.h>
 #include <cuda.h>
 
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 #endif
 
@@ -85,7 +85,7 @@ class SymmetricTensor {
   int peer_fd_{-1};
   bool is_contiguous_view_setup_ = false;
   at::Tensor contiguous_view_;
-#ifdef NVFUSER_DISTRIBUTED
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory>
       torch_symm_handle_;
 #endif

From af128e412071af04ddbaec9b9bbb05f10d7f7d47 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Fri, 3 Apr 2026 07:48:54 +0300
Subject: [PATCH 12/16] Address pending review cmnts

---
 csrc/multidevice/communicator.cpp     | 41 +++++++++++------------
 csrc/multidevice/communicator.h       |  8 +++--
 csrc/multidevice/symmetric_tensor.cpp | 47 ++++++++++++++++-----------
 csrc/multidevice/symmetric_tensor.h   |  2 +-
 4 files changed, 55 insertions(+), 43 deletions(-)

diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index c2bd92c4c26..b836d9b2394 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -334,6 +334,15 @@ Communicator& Communicator::getInstance() {
   return *communicator;
 }
 
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+void Communicator::registerProcessGroup(
+    const std::string& name,
+    const c10::intrusive_ptr<c10d::ProcessGroup>& pg) {
+  c10d::register_process_group(name, pg);
+  process_groups_[name] = pg;
+}
+#endif
+
 void Communicator::cleanup() {
   static bool cleaned_up = false;
   NVF_CHECK(
@@ -367,10 +376,12 @@ void Communicator::cleanup() {
     }
   }
 #endif
+#if defined(USE_DISTRIBUTED)
   for (const auto& entry : process_groups_) {
     c10d::unregister_process_group(entry.first);
   }
   process_groups_.clear();
+#endif
 #endif
   backends_.clear();
 }
@@ -412,13 +423,10 @@ c10d::Backend* Communicator::getBackendForTeam(
 #else
     backends_[team_key] = nullptr;
 #endif
+  }
 #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
-    std::optional<c10d::ProcessGroup::BackendType> pg_backend =
-        (b == CommunicatorBackend::kNccl)
-        ? std::optional<c10d::ProcessGroup::BackendType>(
-              c10d::ProcessGroup::BackendType::NCCL)
-        : std::nullopt;
-    if (backends_[team_key] != nullptr && pg_backend.has_value()) {
+  if (process_groups_.find(team_key) == process_groups_.end()) {
+    if (b == CommunicatorBackend::kNccl) {
       auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
       RankType team_rank = std::distance(team.begin(), rank_it);
 
@@ -426,15 +434,17 @@ c10d::Backend* Communicator::getBackendForTeam(
           c10::make_intrusive<c10d::PrefixStore>(team_key, store_),
           team_rank,
           static_cast<int>(team.size()));
-      pg->setBackend(c10::DeviceType::CUDA, *pg_backend, backends_[team_key]);
-      pg->setDefaultBackend(*pg_backend);
+      pg->setBackend(
+          c10::DeviceType::CUDA,
+          c10d::ProcessGroup::BackendType::NCCL,
+          backends_[team_key]);
+      pg->setDefaultBackend(c10d::ProcessGroup::BackendType::NCCL);
       pg->setGroupName(team_key);
 
-      c10d::register_process_group(team_key, pg);
-      process_groups_[team_key] = std::move(pg);
+      registerProcessGroup(team_key, pg);
     }
-#endif
   }
+#endif
   return backends_.at(team_key).get();
 }
 
@@ -455,13 +465,4 @@ void Communicator::barrier(std::optional<CommunicatorBackend> backend) {
   getWorld(backend)->barrier(options)->wait();
 }
 
-std::string Communicator::getSymmMemGroupKey(
-    std::optional<CommunicatorBackend> backend) {
-  std::vector<RankType> all_ranks(size_);
-  std::iota(all_ranks.begin(), all_ranks.end(), 0);
-  CommunicatorBackend b = backend.value_or(default_backend_);
-  (void)getBackendForTeam(all_ranks, b);
-  return getTeamKey(all_ranks, b);
-}
-
 } // namespace nvfuser
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 85406c4d906..35391f37580 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -111,9 +111,11 @@ class NVF_API Communicator {
   c10d::Backend* getWorld(
       std::optional<CommunicatorBackend> backend = std::nullopt);
 
-  // Returns the world process-group name for the given backend.
-  std::string getSymmMemGroupKey(
-      std::optional<CommunicatorBackend> backend = std::nullopt);
+#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  void registerProcessGroup(
+      const std::string& name,
+      const c10::intrusive_ptr<c10d::ProcessGroup>& pg);
+#endif
 
   // returns if a backend is available for creation
   bool isBackendAvailable(CommunicatorBackend backend) const {
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index c40dc48e87b..ad9266b7952 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -57,22 +57,30 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
         comm.isBackendAvailable(CommunicatorBackend::kNccl),
         "NCCL backend is required for non-native symmetric memory backend");
 
-    const std::string group_name =
-        comm.getSymmMemGroupKey(CommunicatorBackend::kNccl);
-
-    static std::once_flag pg0_once;
-    std::call_once(pg0_once, [&]() {
-      try {
-        (void)c10d::resolve_process_group("0");
-      } catch (const std::exception&) {
-        // resolve_process_group throws c10d Error
-        // (derives from std::exception)
-        auto pg = c10d::resolve_process_group(group_name);
-        c10d::register_process_group("0", pg);
-      }
-    });
+    std::vector<RankType> all_ranks(comm.size());
+    std::iota(all_ranks.begin(), all_ranks.end(), 0);
+    (void)comm.getBackendForTeam(all_ranks, CommunicatorBackend::kNccl);
+    std::string group_name = std::accumulate(
+        std::begin(all_ranks),
+        std::end(all_ranks),
+        std::string("nccl"),
+        [](const std::string& a, const RankType& b) {
+          return a.empty() ? std::to_string(b) : a + ',' + std::to_string(b);
+        });
+    if (backend == SymmetricMemoryBackend::PyTorchNvshmem) {
+      static std::once_flag pg0_once;
+      std::call_once(pg0_once, [&]() {
+        try {
+          (void)c10d::resolve_process_group("0");
+        } catch (const std::exception&) {
+          // resolve_process_group throws c10d Error
+          // (derives from std::exception)
+          auto pg = c10d::resolve_process_group(group_name);
+          comm.registerProcessGroup("0", pg);
+        }
+      });
+    }
 
-    comm.barrier(CommunicatorBackend::kNccl);
     return group_name;
   }
 
@@ -424,23 +432,24 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
   if (are_remote_tensors_setup_ == true) {
     return;
   }
+  Communicator& comm = Communicator::getInstance();
 #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   // PyTorch backend: perform rendezvous here (lazy, on first
   // setupRemoteHandles).
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
   if (backend != SymmetricMemoryBackend::Native) {
     const std::string group_name = ensurePyTorchSymmMemBackend(backend);
+    comm.barrier(CommunicatorBackend::kNccl);
     torch_symm_handle_ =
         c10d::symmetric_memory::rendezvous(local_tensor_, group_name);
     are_remote_tensors_setup_ = true;
     if (torch_symm_handle_->has_multicast_support()) {
       is_multicast_setup_ = true;
-      mc_ptr_ = torch_symm_handle_->get_multicast_ptr();
+      multicast_ptr_ = torch_symm_handle_->get_multicast_ptr();
     }
     return;
   }
 #endif
-  Communicator& comm = Communicator::getInstance();
   CUmemGenericAllocationHandle local_handle = alloc_handles_[my_device_id_];
   CUdeviceptr local_ptr = remote_ptrs_[my_device_id_];
 
@@ -540,7 +549,7 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
 
 void* SymmetricTensor::multicastPtr() const {
   NVF_CHECK(is_multicast_setup_, "Multicast not setup");
-  return mc_ptr_;
+  return multicast_ptr_;
 }
 
 void SymmetricTensor::setupContiguousView(const std::string& tag) {
@@ -743,7 +752,7 @@ void SymmetricTensor::setupMulticast(
   NVFUSER_CUDA_SAFE_CALL(cuMemSetAccess(mc_ptr, aligned_size_, &access, 1));
 
   // NOLINTNEXTLINE(performance-no-int-to-ptr)
-  mc_ptr_ = reinterpret_cast<void*>(mc_ptr + offset_diff);
+  multicast_ptr_ = reinterpret_cast<void*>(mc_ptr + offset_diff);
   mc_base_ptr_ = mc_ptr;
   is_multicast_setup_ = true;
 
diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h
index a2ab9e2d2ef..92af0d4f5f6 100644
--- a/csrc/multidevice/symmetric_tensor.h
+++ b/csrc/multidevice/symmetric_tensor.h
@@ -79,7 +79,7 @@ class SymmetricTensor {
   bool is_multicast_setup_ = false;
   CUmemGenericAllocationHandle mcast_handle_{};
   CUdevice cu_dev_{};
-  void* mc_ptr_{nullptr};
+  void* multicast_ptr_{nullptr};
   CUdeviceptr mc_base_ptr_{0};
   int exporter_rank_{-1};
   int peer_fd_{-1};

From 828573d3b4af61f4b84da292ee901f4c1cb14492 Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Fri, 10 Apr 2026 14:33:42 +0300
Subject: [PATCH 13/16] Add mocks for c10d

---
 csrc/multidevice/c10d_mock.h          |  58 ++++++++++++-
 csrc/multidevice/communicator.cpp     |  21 +++--
 csrc/multidevice/communicator.h       |   4 +-
 csrc/multidevice/symmetric_tensor.cpp | 120 ++++++++++++--------------
 csrc/multidevice/symmetric_tensor.h   |   4 +-
 5 files changed, 130 insertions(+), 77 deletions(-)

diff --git a/csrc/multidevice/c10d_mock.h b/csrc/multidevice/c10d_mock.h
index 33a061e1481..7a392158ace 100644
--- a/csrc/multidevice/c10d_mock.h
+++ b/csrc/multidevice/c10d_mock.h
@@ -20,8 +20,12 @@
 
 #pragma once
 
+#include <ATen/ATen.h>
 #include <ATen/core/TensorBody.h>
 #include <ATen/core/ivalue.h>
+#include <c10/core/Device.h>
+#include <c10/core/ScalarType.h>
+#include <c10/util/ArrayRef.h>
 #include <c10/util/intrusive_ptr.h>
 
 namespace c10d {
@@ -34,7 +38,7 @@ class Work : public torch::CustomClassHolder {
 };
 
 struct ReduceOp : torch::CustomClassHolder {
-  enum RedOpType {
+  enum RedOpType : std::uint8_t {
     SUM,
     AVG,
     PRODUCT,
@@ -211,4 +215,56 @@ class TCPStore : public torch::CustomClassHolder {
   }
 };
 
+class ProcessGroup : public torch::CustomClassHolder {
+ public:
+};
+
+inline c10::intrusive_ptr<ProcessGroup> resolve_process_group(
+    const std::string&) {
+  return c10::make_intrusive<ProcessGroup>();
+}
+
+inline void register_process_group(
+    const std::string&,
+    const c10::intrusive_ptr<ProcessGroup>&) {}
+
 } // namespace c10d
+
+namespace c10d::symmetric_memory {
+
+class SymmetricMemory : public torch::CustomClassHolder {
+ public:
+  ~SymmetricMemory() override = default;
+  virtual bool has_multicast_support() {
+    return false;
+  }
+  virtual void* get_multicast_ptr() {
+    return nullptr;
+  }
+  at::Tensor get_remote_tensor(
+      int peer,
+      c10::IntArrayRef sizes,
+      c10::ScalarType dtype) {
+    return at::empty(sizes, at::TensorOptions().dtype(dtype));
+  }
+};
+
+inline void set_backend(const std::string&) {}
+
+inline at::Tensor empty_strided_p2p(
+    c10::IntArrayRef size,
+    c10::IntArrayRef stride,
+    c10::ScalarType dtype,
+    c10::Device device,
+    const std::optional<std::string>& group_name,
+    std::optional<uint64_t>) {
+  return at::empty(size, at::TensorOptions().dtype(dtype));
+}
+
+inline c10::intrusive_ptr<SymmetricMemory> rendezvous(
+    const at::Tensor&,
+    const std::optional<std::string>& = std::nullopt) {
+  return c10::make_intrusive<SymmetricMemory>();
+}
+
+} // namespace c10d::symmetric_memory
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index b836d9b2394..ad953c1cffa 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -334,14 +334,14 @@ Communicator& Communicator::getInstance() {
   return *communicator;
 }
 
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+// #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
 void Communicator::registerProcessGroup(
     const std::string& name,
     const c10::intrusive_ptr<c10d::ProcessGroup>& pg) {
   c10d::register_process_group(name, pg);
   process_groups_[name] = pg;
 }
-#endif
+// #endif
 
 void Communicator::cleanup() {
   static bool cleaned_up = false;
@@ -402,16 +402,20 @@ c10d::Backend* Communicator::getBackendForTeam(
   // generate a string key which is unique to the team
   // create the team and cache it
   std::string team_key = prefix + getTeamKey(team, b);
+  auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
+  if (rank_it == team.end()) {
+    return nullptr;
+  }
   // check if backend associated with the team is present in the cache
   if (backends_.find(team_key) ==
       backends_.end()) { // create the backend and cache it
 #ifdef NVFUSER_DISTRIBUTED
     backends_[team_key] = [&]() -> c10::intrusive_ptr<c10d::Backend> {
       // check that the caller's rank belongs to the requested team
-      auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
-      if (rank_it == team.end()) {
-        return nullptr;
-      }
+      // auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
+      // if (rank_it == team.end()) {
+      //   return nullptr;
+      // }
       // retrieve the caller's rank index/position in the team
       RankType team_rank = std::distance(team.begin(), rank_it);
       return createBackend(
@@ -427,7 +431,10 @@ c10d::Backend* Communicator::getBackendForTeam(
 #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (process_groups_.find(team_key) == process_groups_.end()) {
     if (b == CommunicatorBackend::kNccl) {
-      auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
+      // auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
+      // if (rank_it == team.end()) {
+      //   return nullptr;
+      // }
       RankType team_rank = std::distance(team.begin(), rank_it);
 
       auto pg = c10::make_intrusive<c10d::ProcessGroup>(
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 35391f37580..72017dde4a1 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -111,11 +111,11 @@ class NVF_API Communicator {
   c10d::Backend* getWorld(
       std::optional<CommunicatorBackend> backend = std::nullopt);
 
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   void registerProcessGroup(
       const std::string& name,
       const c10::intrusive_ptr<c10d::ProcessGroup>& pg);
-#endif
+  // #endif
 
   // returns if a backend is available for creation
   bool isBackendAvailable(CommunicatorBackend backend) const {
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index ad9266b7952..68cf874e6ec 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -24,8 +24,8 @@ namespace nvfuser {
 
 namespace {
 
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
-std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
+// #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+std::string initSymmMemBackendAndGetGroup(SymmetricMemoryBackend backend) {
   static std::once_flag once;
   std::call_once(once, [backend]() {
     const char* name = nullptr;
@@ -51,45 +51,37 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) {
   NVF_CHECK(
       comm.is_available(), "Communicator not available for symmetric memory");
 
-  // Always return a valid group name
-  if (backend != SymmetricMemoryBackend::Native) {
-    NVF_CHECK(
-        comm.isBackendAvailable(CommunicatorBackend::kNccl),
-        "NCCL backend is required for non-native symmetric memory backend");
-
-    std::vector<RankType> all_ranks(comm.size());
-    std::iota(all_ranks.begin(), all_ranks.end(), 0);
-    (void)comm.getBackendForTeam(all_ranks, CommunicatorBackend::kNccl);
-    std::string group_name = std::accumulate(
-        std::begin(all_ranks),
-        std::end(all_ranks),
-        std::string("nccl"),
-        [](const std::string& a, const RankType& b) {
-          return a.empty() ? std::to_string(b) : a + ',' + std::to_string(b);
-        });
-    if (backend == SymmetricMemoryBackend::PyTorchNvshmem) {
-      static std::once_flag pg0_once;
-      std::call_once(pg0_once, [&]() {
-        try {
-          (void)c10d::resolve_process_group("0");
-        } catch (const std::exception&) {
-          // resolve_process_group throws c10d Error
-          // (derives from std::exception)
-          auto pg = c10d::resolve_process_group(group_name);
-          comm.registerProcessGroup("0", pg);
-        }
+  NVF_CHECK(
+      comm.isBackendAvailable(CommunicatorBackend::kNccl),
+      "NCCL backend is required for non-native symmetric memory backend");
+
+  std::vector<RankType> all_ranks(comm.size());
+  std::iota(all_ranks.begin(), all_ranks.end(), 0);
+  (void)comm.getBackendForTeam(all_ranks, CommunicatorBackend::kNccl);
+  std::string group_name = std::accumulate(
+      std::begin(all_ranks),
+      std::end(all_ranks),
+      std::string("nccl"),
+      [](const std::string& a, const RankType& b) {
+        return a.empty() ? std::to_string(b) : a + ',' + std::to_string(b);
       });
-    }
-
-    return group_name;
+  if (backend == SymmetricMemoryBackend::PyTorchNvshmem) {
+    static std::once_flag pg0_once;
+    std::call_once(pg0_once, [&]() {
+      try {
+        (void)c10d::resolve_process_group("0");
+      } catch (const std::exception&) {
+        // resolve_process_group throws c10d Error
+        // (derives from std::exception)
+        auto pg = c10d::resolve_process_group(group_name);
+        comm.registerProcessGroup("0", pg);
+      }
+    });
   }
 
-  NVF_ERROR(
-      false,
-      "No c10d backend available for symmetric memory rendezvous. "
-      "Expected NCCL or UCC process group.");
+  return group_name;
 }
-#endif
+// #endif
 
 // Returns the allocation granularity for symmetric memory.
 // - query_mcast_granularity: if true, considers multicast granularity
@@ -162,9 +154,9 @@ at::Tensor SymmetricTensor::allocate(
     at::Device device) {
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
 
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (backend != SymmetricMemoryBackend::Native) {
-    const std::string group_name = ensurePyTorchSymmMemBackend(backend);
+    const std::string group_name = initSymmMemBackendAndGetGroup(backend);
     std::vector<int64_t> strides(sizes.size());
     strides.back() = 1;
     for (int64_t i = (int64_t)strides.size() - 2; i >= 0; --i) {
@@ -184,16 +176,14 @@ at::Tensor SymmetricTensor::allocate(
         alloc_group_name,
         /*alloc_id=*/c10::nullopt);
   }
-#else
-  if (backend != SymmetricMemoryBackend::Native) {
-    NVF_ERROR(
-        false,
-        "PyTorch symmetric memory backend requires a build with "
-        "NVFUSER_DISTRIBUTED and USE_DISTRIBUTED. Use "
-        "NVFUSER_ENABLE=symmetric_memory_backend(native) "
-        "or do not set symmetric_memory_backend.");
-  }
-#endif
+  // #else
+  //   NVF_ERROR(
+  //       (backend == SymmetricMemoryBackend::Native),
+  //       "PyTorch symmetric memory backend requires a build with "
+  //       "NVFUSER_DISTRIBUTED and USE_DISTRIBUTED. Use "
+  //       "NVFUSER_ENABLE=symmetric_memory_backend(native) "
+  //       "or do not set symmetric_memory_backend.");
+  // #endif
 
   int is_vmm_supported = 0;
   NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute(
@@ -236,7 +226,7 @@ at::Tensor SymmetricTensor::allocate(
 
   std::vector<int64_t> strides(sizes.size());
   strides.back() = 1;
-  for (int64_t i = static_cast<int64_t>(strides.size()) - 2; i >= 0; --i) {
+  for (int64_t i = std::ssize(strides) - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * sizes[i + 1];
   }
 
@@ -325,7 +315,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
       "Expected CUDA tensor, got: ",
       local_tensor.device());
 
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
   if (backend != SymmetricMemoryBackend::Native) {
     Communicator& comm = Communicator::getInstance();
@@ -334,7 +324,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
     requested_size_ = local_tensor.numel() * local_tensor.element_size();
     return;
   }
-#endif
+  // #endif
 
   std::string error = SymmetricTensor::validate(local_tensor);
   NVF_CHECK(error.empty(), "Invalid symmetric allocation: ", error);
@@ -377,11 +367,11 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
 }
 
 SymmetricTensor::~SymmetricTensor() {
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     return; // PyTorch backend: no native VMM cleanup
   }
-#endif
+  // #endif
 #if (CUDA_VERSION >= 13000)
   if (is_multicast_setup_) {
     if (mc_base_ptr_) {
@@ -433,12 +423,12 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
     return;
   }
   Communicator& comm = Communicator::getInstance();
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   // PyTorch backend: perform rendezvous here (lazy, on first
   // setupRemoteHandles).
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
   if (backend != SymmetricMemoryBackend::Native) {
-    const std::string group_name = ensurePyTorchSymmMemBackend(backend);
+    const std::string group_name = initSymmMemBackendAndGetGroup(backend);
     comm.barrier(CommunicatorBackend::kNccl);
     torch_symm_handle_ =
         c10d::symmetric_memory::rendezvous(local_tensor_, group_name);
@@ -449,7 +439,7 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
     }
     return;
   }
-#endif
+  // #endif
   CUmemGenericAllocationHandle local_handle = alloc_handles_[my_device_id_];
   CUdeviceptr local_ptr = remote_ptrs_[my_device_id_];
 
@@ -527,14 +517,14 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
     return local_tensor_;
   }
 
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     return torch_symm_handle_->get_remote_tensor(
         static_cast<int>(rank),
         local_tensor_.sizes(),
         local_tensor_.scalar_type());
   }
-#endif
+  // #endif
 
   NVF_CHECK(are_remote_tensors_setup_ == true, "Remote tensors not setup");
   return at::from_blob(
@@ -556,14 +546,14 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
   if (is_contiguous_view_setup_) {
     return;
   }
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     NVF_THROW(
         "Contiguous view is not yet supported for PyTorch symmetric memory "
         "backend."
         "Use native backend for SymmetricContiguousView.");
   }
-#endif
+  // #endif
   NVF_CHECK(
       are_remote_tensors_setup_ == true,
       "Remote tensors must be setup before setupContiguousView");
@@ -629,13 +619,13 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
 }
 
 at::Tensor SymmetricTensor::getContiguousView() const {
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     NVF_THROW(
         "Contiguous view is not yet supported for PyTorch symmetric memory "
         "backend.");
   }
-#endif
+  // #endif
   NVF_CHECK(is_contiguous_view_setup_, "Contiguous view not setup");
   return contiguous_view_;
 }
@@ -647,7 +637,7 @@ void SymmetricTensor::setupMulticast(
   if (is_multicast_setup_) {
     return;
   }
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
     if (!torch_symm_handle_) {
       setupRemoteHandles(tag);
@@ -657,7 +647,7 @@ void SymmetricTensor::setupMulticast(
     }
     return;
   }
-#endif
+  // #endif
   Communicator& comm = Communicator::getInstance();
   const int64_t my_rank = comm.deviceId();
   const int64_t local_rank = comm.local_rank();
diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h
index 92af0d4f5f6..888472f2b46 100644
--- a/csrc/multidevice/symmetric_tensor.h
+++ b/csrc/multidevice/symmetric_tensor.h
@@ -85,10 +85,10 @@ class SymmetricTensor {
   int peer_fd_{-1};
   bool is_contiguous_view_setup_ = false;
   at::Tensor contiguous_view_;
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory>
       torch_symm_handle_;
-#endif
+  // #endif
 };
 
 } // namespace nvfuser

From 6a5d3c3aa9eef9afff785c34fa2914dd25b7982d Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Fri, 10 Apr 2026 15:18:15 +0300
Subject: [PATCH 14/16] Fix missing guard for process_groups

---
 csrc/multidevice/communicator.h       |  4 ++--
 csrc/multidevice/symmetric_tensor.cpp | 15 +++++++--------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 72017dde4a1..9fbbde89a4e 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -161,10 +161,10 @@ class NVF_API Communicator {
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
   // c10d process-group wrappers registered for symmetric-memory rendezvous.
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::ProcessGroup>>
       process_groups_;
-#endif
+  // #endif
 };
 
 } // namespace nvfuser
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index e1f2204fc5a..e262a7a89c9 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -745,21 +745,20 @@ void SymmetricTensor::setupMulticast(
       aligned_size_,
       0));
 
-  CUdeviceptr multicast_ptr_ = 0;
+  CUdeviceptr mc_ptr = 0;
   NVFUSER_CUDA_SAFE_CALL(
-      cuMemAddressReserve(&multicast_ptr_, aligned_size_, granularity_, 0, 0));
-  NVFUSER_CUDA_SAFE_CALL(
-      cuMemMap(multicast_ptr_, aligned_size_, 0, mcast_handle_, 0));
+      cuMemAddressReserve(&mc_ptr, aligned_size_, granularity_, 0, 0));
+  NVFUSER_CUDA_SAFE_CALL(cuMemMap(mc_ptr, aligned_size_, 0, mcast_handle_, 0));
 
   CUmemAccessDesc access{};
   access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
   access.location.id = static_cast<int>(local_rank);
   access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-  NVFUSER_CUDA_SAFE_CALL(
-      cuMemSetAccess(multicast_ptr_, aligned_size_, &access, 1));
+  NVFUSER_CUDA_SAFE_CALL(cuMemSetAccess(mc_ptr, aligned_size_, &access, 1));
 
-  multicast_ptr_ = multicast_ptr_ + offset_diff;
-  mc_base_ptr_ = multicast_ptr_;
+  // NOLINTNEXTLINE(performance-no-int-to-ptr)
+  multicast_ptr_ = reinterpret_cast<void*>(mc_ptr + offset_diff);
+  mc_base_ptr_ = mc_ptr;
   is_multicast_setup_ = true;
 
   comm.barrier();

From 2908e70f74ac191865923111841b68ca5083b31e Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Fri, 10 Apr 2026 16:05:08 +0300
Subject: [PATCH 15/16] include mock header for non distributed build

---
 csrc/multidevice/symmetric_tensor.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h
index 3ddc5955a0c..c323b045c9c 100644
--- a/csrc/multidevice/symmetric_tensor.h
+++ b/csrc/multidevice/symmetric_tensor.h
@@ -12,6 +12,8 @@
 
 #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
+#else
+#include "multidevice/c10d_mock.h"
 #endif
 
 namespace nvfuser {

From aa4f9c1628c1a201e38dd7203b698fa56d27071c Mon Sep 17 00:00:00 2001
From: saivishal1999 <spothula@nvidia.com>
Date: Fri, 10 Apr 2026 18:04:17 +0300
Subject: [PATCH 16/16] Remove guard comments

---
 csrc/multidevice/c10d_mock.h          | 14 +++++++-----
 csrc/multidevice/communicator.cpp     | 15 +------------
 csrc/multidevice/communicator.h       |  7 ++----
 csrc/multidevice/symmetric_tensor.cpp | 32 +--------------------------
 csrc/multidevice/symmetric_tensor.h   |  3 +--
 5 files changed, 13 insertions(+), 58 deletions(-)

diff --git a/csrc/multidevice/c10d_mock.h b/csrc/multidevice/c10d_mock.h
index 7a392158ace..bc51fe02dfb 100644
--- a/csrc/multidevice/c10d_mock.h
+++ b/csrc/multidevice/c10d_mock.h
@@ -220,13 +220,15 @@ class ProcessGroup : public torch::CustomClassHolder {
 };
 
 inline c10::intrusive_ptr<ProcessGroup> resolve_process_group(
-    const std::string&) {
+    const std::string& group_name) {
   return c10::make_intrusive<ProcessGroup>();
 }
 
 inline void register_process_group(
-    const std::string&,
-    const c10::intrusive_ptr<ProcessGroup>&) {}
+    const std::string& group_name,
+    const c10::intrusive_ptr<ProcessGroup>& group) {}
+
+inline void unregister_process_group(const std::string& group_name) {}
 
 } // namespace c10d
 
@@ -257,13 +259,13 @@ inline at::Tensor empty_strided_p2p(
     c10::ScalarType dtype,
     c10::Device device,
     const std::optional<std::string>& group_name,
-    std::optional<uint64_t>) {
+    std::optional<uint64_t> alloc_id) {
   return at::empty(size, at::TensorOptions().dtype(dtype));
 }
 
 inline c10::intrusive_ptr<SymmetricMemory> rendezvous(
-    const at::Tensor&,
-    const std::optional<std::string>& = std::nullopt) {
+    const at::Tensor& tensor,
+    const std::optional<std::string>& group_name = std::nullopt) {
   return c10::make_intrusive<SymmetricMemory>();
 }
 
diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp
index ad953c1cffa..afe2ff1308d 100644
--- a/csrc/multidevice/communicator.cpp
+++ b/csrc/multidevice/communicator.cpp
@@ -16,7 +16,6 @@
 #ifdef NVFUSER_DISTRIBUTED
 #include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
 #include <torch/csrc/distributed/c10d/PrefixStore.hpp>
-#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
 #include <torch/csrc/distributed/c10d/exception.h>
 #ifdef USE_C10D_NCCL
 #include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
@@ -334,14 +333,12 @@ Communicator& Communicator::getInstance() {
   return *communicator;
 }
 
-// #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
 void Communicator::registerProcessGroup(
     const std::string& name,
     const c10::intrusive_ptr<c10d::ProcessGroup>& pg) {
   c10d::register_process_group(name, pg);
   process_groups_[name] = pg;
 }
-// #endif
 
 void Communicator::cleanup() {
   static bool cleaned_up = false;
@@ -376,12 +373,10 @@ void Communicator::cleanup() {
     }
   }
 #endif
-#if defined(USE_DISTRIBUTED)
   for (const auto& entry : process_groups_) {
     c10d::unregister_process_group(entry.first);
   }
   process_groups_.clear();
-#endif
 #endif
   backends_.clear();
 }
@@ -402,6 +397,7 @@ c10d::Backend* Communicator::getBackendForTeam(
   // generate a string key which is unique to the team
   // create the team and cache it
   std::string team_key = prefix + getTeamKey(team, b);
+  // check that the caller's rank belongs to the requested team
   auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
   if (rank_it == team.end()) {
     return nullptr;
@@ -411,11 +407,6 @@ c10d::Backend* Communicator::getBackendForTeam(
       backends_.end()) { // create the backend and cache it
 #ifdef NVFUSER_DISTRIBUTED
     backends_[team_key] = [&]() -> c10::intrusive_ptr<c10d::Backend> {
-      // check that the caller's rank belongs to the requested team
-      // auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
-      // if (rank_it == team.end()) {
-      //   return nullptr;
-      // }
       // retrieve the caller's rank index/position in the team
       RankType team_rank = std::distance(team.begin(), rank_it);
       return createBackend(
@@ -431,10 +422,6 @@ c10d::Backend* Communicator::getBackendForTeam(
 #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (process_groups_.find(team_key) == process_groups_.end()) {
     if (b == CommunicatorBackend::kNccl) {
-      // auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId());
-      // if (rank_it == team.end()) {
-      //   return nullptr;
-      // }
       RankType team_rank = std::distance(team.begin(), rank_it);
 
       auto pg = c10::make_intrusive<c10d::ProcessGroup>(
diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h
index 9fbbde89a4e..f19707334b2 100644
--- a/csrc/multidevice/communicator.h
+++ b/csrc/multidevice/communicator.h
@@ -11,7 +11,7 @@
 #include <ATen/core/ivalue.h>
 #include <c10/util/intrusive_ptr.h>
 
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+#if defined(NVFUSER_DISTRIBUTED)
 #include <torch/csrc/distributed/c10d/Backend.hpp>
 #include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
 #include <torch/csrc/distributed/c10d/TCPStore.hpp>
@@ -111,11 +111,9 @@ class NVF_API Communicator {
   c10d::Backend* getWorld(
       std::optional<CommunicatorBackend> backend = std::nullopt);
 
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   void registerProcessGroup(
       const std::string& name,
       const c10::intrusive_ptr<c10d::ProcessGroup>& pg);
-  // #endif
 
   // returns if a backend is available for creation
   bool isBackendAvailable(CommunicatorBackend backend) const {
@@ -161,10 +159,9 @@ class NVF_API Communicator {
   // cache for the created backends. The keys are strings generated from Teams
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::Backend>> backends_;
   // c10d process-group wrappers registered for symmetric-memory rendezvous.
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+  // Keeps track of the process groups created for the rendezvous.
   std::unordered_map<std::string, c10::intrusive_ptr<c10d::ProcessGroup>>
       process_groups_;
-  // #endif
 };
 
 } // namespace nvfuser
diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp
index e262a7a89c9..862a40045e3 100644
--- a/csrc/multidevice/symmetric_tensor.cpp
+++ b/csrc/multidevice/symmetric_tensor.cpp
@@ -15,16 +15,10 @@
 #include "multidevice/ipc_utils.h"
 #include "multidevice/utils.h"
 
-#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
-#include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
-#include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
-#endif
-
 namespace nvfuser {
 
 namespace {
 
-// #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
 std::string initSymmMemBackendAndGetGroup(SymmetricMemoryBackend backend) {
   static std::once_flag once;
   std::call_once(once, [backend]() {
@@ -53,7 +47,7 @@ std::string initSymmMemBackendAndGetGroup(SymmetricMemoryBackend backend) {
 
   NVF_CHECK(
       comm.isBackendAvailable(CommunicatorBackend::kNccl),
-      "NCCL backend is required for non-native symmetric memory backend");
+      "kNccl backend is required for non-native symmetric memory backend");
 
   std::vector<RankType> all_ranks(comm.size());
   std::iota(all_ranks.begin(), all_ranks.end(), 0);
@@ -81,7 +75,6 @@ std::string initSymmMemBackendAndGetGroup(SymmetricMemoryBackend backend) {
 
   return group_name;
 }
-// #endif
 
 // Returns the allocation granularity for symmetric memory.
 // - query_mcast_granularity: if true, considers multicast granularity
@@ -154,7 +147,6 @@ at::Tensor SymmetricTensor::allocate(
     at::Device device) {
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
 
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (backend != SymmetricMemoryBackend::Native) {
     const std::string group_name = initSymmMemBackendAndGetGroup(backend);
     std::vector<int64_t> strides(sizes.size());
@@ -176,14 +168,6 @@ at::Tensor SymmetricTensor::allocate(
         alloc_group_name,
         /*alloc_id=*/c10::nullopt);
   }
-  // #else
-  //   NVF_ERROR(
-  //       (backend == SymmetricMemoryBackend::Native),
-  //       "PyTorch symmetric memory backend requires a build with "
-  //       "NVFUSER_DISTRIBUTED and USE_DISTRIBUTED. Use "
-  //       "NVFUSER_ENABLE=symmetric_memory_backend(native) "
-  //       "or do not set symmetric_memory_backend.");
-  // #endif
 
   int is_vmm_supported = 0;
   NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute(
@@ -314,7 +298,6 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
       "Expected CUDA tensor, got: ",
       local_tensor.device());
 
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
   if (backend != SymmetricMemoryBackend::Native) {
     Communicator& comm = Communicator::getInstance();
@@ -323,7 +306,6 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
     requested_size_ = local_tensor.numel() * local_tensor.element_size();
     return;
   }
-  // #endif
 
   std::string error = SymmetricTensor::validate(local_tensor);
   NVF_CHECK(error.empty(), "Invalid symmetric allocation: ", error);
@@ -367,11 +349,9 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor)
 }
 
 SymmetricTensor::~SymmetricTensor() {
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     return; // PyTorch backend: no native VMM cleanup
   }
-  // #endif
 #if (CUDA_VERSION >= 13000)
   if (is_multicast_setup_) {
     if (mc_base_ptr_) {
@@ -423,7 +403,6 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
     return;
   }
   Communicator& comm = Communicator::getInstance();
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   // PyTorch backend: perform rendezvous here (lazy, on first
   // setupRemoteHandles).
   SymmetricMemoryBackend backend = getSymmetricMemoryBackend();
@@ -439,7 +418,6 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) {
     }
     return;
   }
-  // #endif
   CUmemGenericAllocationHandle local_handle = alloc_handles_[my_device_id_];
   CUdeviceptr local_ptr = remote_ptrs_[my_device_id_];
 
@@ -533,14 +511,12 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const {
     return local_tensor_;
   }
 
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     return torch_symm_handle_->get_remote_tensor(
         static_cast<int>(rank),
         local_tensor_.sizes(),
         local_tensor_.scalar_type());
   }
-  // #endif
 
   NVF_CHECK(are_remote_tensors_setup_ == true, "Remote tensors not setup");
   return at::from_blob(
@@ -562,14 +538,12 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
   if (is_contiguous_view_setup_) {
     return;
   }
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     NVF_THROW(
         "Contiguous view is not yet supported for PyTorch symmetric memory "
         "backend."
         "Use native backend for SymmetricContiguousView.");
   }
-  // #endif
   NVF_CHECK(
       are_remote_tensors_setup_ == true,
       "Remote tensors must be setup before setupContiguousView");
@@ -634,13 +608,11 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) {
 }
 
 at::Tensor SymmetricTensor::getContiguousView() const {
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (torch_symm_handle_) {
     NVF_THROW(
         "Contiguous view is not yet supported for PyTorch symmetric memory "
         "backend.");
   }
-  // #endif
   NVF_CHECK(is_contiguous_view_setup_, "Contiguous view not setup");
   return contiguous_view_;
 }
@@ -652,7 +624,6 @@ void SymmetricTensor::setupMulticast(
   if (is_multicast_setup_) {
     return;
   }
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) {
     if (!torch_symm_handle_) {
       setupRemoteHandles(tag);
@@ -662,7 +633,6 @@ void SymmetricTensor::setupMulticast(
     }
     return;
   }
-  // #endif
   Communicator& comm = Communicator::getInstance();
   const int64_t my_rank = comm.deviceId();
   const int64_t local_rank = comm.local_rank();
diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h
index c323b045c9c..ae10cb3dcbc 100644
--- a/csrc/multidevice/symmetric_tensor.h
+++ b/csrc/multidevice/symmetric_tensor.h
@@ -11,6 +11,7 @@
 #include <cuda.h>
 
 #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
+#include <torch/csrc/distributed/c10d/GroupRegistry.hpp>
 #include <torch/csrc/distributed/c10d/symm_mem/SymmetricMemory.hpp>
 #else
 #include "multidevice/c10d_mock.h"
@@ -94,10 +95,8 @@ class SymmetricTensor {
   int peer_fd_{-1};
   bool is_contiguous_view_setup_ = false;
   at::Tensor contiguous_view_;
-  // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED)
   c10::intrusive_ptr<c10d::symmetric_memory::SymmetricMemory>
       torch_symm_handle_;
-  // #endif
 };
 
 } // namespace nvfuser