From 14fd212fa6ed76fa541f2e4d405675d6dc61e353 Mon Sep 17 00:00:00 2001 From: Sai Vishal Pothula Date: Fri, 27 Feb 2026 04:40:02 +0200 Subject: [PATCH 01/16] Initial implementation of symmetric memory backend for PyTorch --- csrc/multidevice/communicator.cpp | 16 ++ csrc/multidevice/communicator.h | 13 ++ csrc/multidevice/ipc_utils.cpp | 18 ++ csrc/multidevice/ipc_utils.h | 13 ++ csrc/multidevice/symmetric_tensor.cpp | 163 +++++++++++++++++- csrc/multidevice/symmetric_tensor.h | 21 ++- csrc/options.cpp | 1 + csrc/options.h | 2 + fbuild.sh | 24 +++ .../cpp/test_multidevice_symmetric_tensor.cpp | 108 ++++++++++++ 10 files changed, 372 insertions(+), 7 deletions(-) create mode 100755 fbuild.sh diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index dbd65ba4610..7531f7ff186 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -424,4 +424,20 @@ void Communicator::barrier(std::optional backend) { getWorld(backend)->barrier(options)->wait(); } +#ifdef NVFUSER_DISTRIBUTED +c10::intrusive_ptr Communicator::getStore() const { + return c10::intrusive_ptr(store_); +} + +c10::intrusive_ptr Communicator::getWorldBackendIntrusivePtr( + std::optional backend) { + std::vector all_ranks(size_); + std::iota(all_ranks.begin(), all_ranks.end(), 0); + CommunicatorBackend b = backend.value_or(default_backend_); + std::string team_key = getTeamKey(all_ranks, b); + (void)getBackendForTeam(all_ranks, backend, ""); + return backends_.at(team_key); +} +#endif + } // namespace nvfuser diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index b56e6fee3aa..45a7f9e4f4f 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -13,6 +13,7 @@ #ifdef NVFUSER_DISTRIBUTED #include +#include #include #include #else @@ -124,6 +125,18 @@ class NVF_API Communicator { return store_.get(); } +#ifdef NVFUSER_DISTRIBUTED + // Returns the store as an intrusive_ptr for use with PyTorch symmetric + // memory (c10d::symmetric_memory::set_group_info). + c10::intrusive_ptr getStore() const; + + // Returns the world backend as an intrusive_ptr so it can be registered with + // c10d::register_process_group (e.g. for PyTorch symmetric memory NCCL + // rendezvous, which resolves the group by name). + c10::intrusive_ptr getWorldBackendIntrusivePtr( + std::optional backend = std::nullopt); +#endif + private: Communicator( CommunicatorBackend backend = comm_backend_default, diff --git a/csrc/multidevice/ipc_utils.cpp b/csrc/multidevice/ipc_utils.cpp index 656b4ee5e24..01bdf949044 100644 --- a/csrc/multidevice/ipc_utils.cpp +++ b/csrc/multidevice/ipc_utils.cpp @@ -191,4 +191,22 @@ MulticastProtocol getMulticastProtocol() { return MulticastProtocol::BatchMemcpy; } +SymmetricMemoryBackend getSymmetricMemoryBackend() { + if (isOptionEnabled(EnableOption::SymmetricMemoryBackend)) { + if (hasEnableOptionArgument( + EnableOption::SymmetricMemoryBackend, "pytorch_nccl")) { + return SymmetricMemoryBackend::PyTorchNccl; + } + if (hasEnableOptionArgument( + EnableOption::SymmetricMemoryBackend, "pytorch_nvshmem")) { + return SymmetricMemoryBackend::PyTorchNvshmem; + } + if (hasEnableOptionArgument( + EnableOption::SymmetricMemoryBackend, "pytorch_cuda")) { + return SymmetricMemoryBackend::PyTorchCuda; + } + } + return SymmetricMemoryBackend::Native; +} + } // namespace nvfuser diff --git a/csrc/multidevice/ipc_utils.h b/csrc/multidevice/ipc_utils.h index bac466d74f8..0cfd6586e47 100644 --- a/csrc/multidevice/ipc_utils.h +++ b/csrc/multidevice/ipc_utils.h @@ -33,6 +33,19 @@ enum class MulticastProtocol { Memcpy, Multimem, BatchMemcpy }; MulticastProtocol getMulticastProtocol(); +// Backend for symmetric memory allocation and rendezvous. +// Native: Fuser's own CUDA VMM + IPC implementation (default, maintained). +// PyTorch*: Use PyTorch's symmetric memory (torch.distributed._symmetric_memory) +// with the given transport backend (Nccl, Nvshmem, or Cuda). +enum class SymmetricMemoryBackend { + Native, + PyTorchNccl, + PyTorchNvshmem, + PyTorchCuda, +}; + +SymmetricMemoryBackend getSymmetricMemoryBackend(); + // Creates a listening Unix domain socket bound to path. // If path starts with '@', it uses the abstract namespace (replaced with \0). // Returns the socket file descriptor. diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index 902ec5a3a32..6f8c3330515 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -7,6 +7,7 @@ // clang-format on #include "multidevice/symmetric_tensor.h" +#include #include #include "cuda_utils.h" @@ -15,10 +16,63 @@ #include "multidevice/ipc_utils.h" #include "multidevice/utils.h" +#ifdef NVFUSER_DISTRIBUTED +#include +#endif + namespace nvfuser { namespace { +#ifdef NVFUSER_DISTRIBUTED +const char* kPyTorchSymmMemGroupName = "nvfuser_symm"; + +// Cache: tensor storage data ptr -> PyTorch SymmetricMemory handle from +// rendezvous. Used so SymmetricTensor(tensor) can recover the handle. +std::unordered_map>& +getPySymmHandleCache() { + static std::unordered_map< + void*, + c10::intrusive_ptr> + cache; + return cache; +} + +std::mutex& getPySymmHandleCacheMutex() { + static std::mutex m; + return m; +} + +void ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { + static std::once_flag once; + std::call_once(once, [backend]() { + const char* name = nullptr; + switch (backend) { + case SymmetricMemoryBackend::PyTorchNccl: + name = "NCCL"; + break; + case SymmetricMemoryBackend::PyTorchNvshmem: + name = "NVSHMEM"; + break; + case SymmetricMemoryBackend::PyTorchCuda: + name = "CUDA"; + break; + default: + NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend"); + } + c10d::symmetric_memory::set_backend(name); + Communicator& comm = Communicator::getInstance(); + NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory"); + c10d::symmetric_memory::set_group_info( + kPyTorchSymmMemGroupName, + static_cast(comm.deviceId()), + static_cast(comm.size()), + comm.getStore()); + }); +} +#endif + + // Returns the allocation granularity for symmetric memory. // - query_mcast_granularity: if true, considers multicast granularity // - query_mcast_recommended_granularity: if true, uses recommended (larger) @@ -88,6 +142,48 @@ at::Tensor SymmetricTensor::allocate( at::IntArrayRef sizes, at::ScalarType dtype, at::Device device) { + SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); + +#ifdef NVFUSER_DISTRIBUTED + if (backend != SymmetricMemoryBackend::Native) { + ensurePyTorchSymmMemBackend(backend); + std::vector strides(sizes.size()); + strides.back() = 1; + for (int64_t i = (int64_t)strides.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name. + c10::optional alloc_group_name = + (backend == SymmetricMemoryBackend::PyTorchNccl) + ? c10::nullopt + : c10::optional(kPyTorchSymmMemGroupName); + at::Tensor tensor = c10d::symmetric_memory::empty_strided_p2p( + sizes, + strides, + dtype, + device, + alloc_group_name, + c10::nullopt); + c10::intrusive_ptr handle = + c10d::symmetric_memory::rendezvous( + tensor, c10::optional(kPyTorchSymmMemGroupName)); + void* key = tensor.storage().data_ptr().get(); + { + std::lock_guard lock(getPySymmHandleCacheMutex()); + getPySymmHandleCache()[key] = handle; + } + return tensor; + } +#else + if (backend != SymmetricMemoryBackend::Native) { + NVF_ERROR( + false, + "PyTorch symmetric memory backend requires a build with " + "NVFUSER_DISTRIBUTED. Use NVFUSER_ENABLE=symmetric_memory_backend(native) " + "or do not set symmetric_memory_backend."); + } +#endif + int is_vmm_supported; NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute( &is_vmm_supported, @@ -212,6 +308,28 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) "Expected CUDA tensor, got: ", local_tensor.device()); +#ifdef NVFUSER_DISTRIBUTED + { + void* key = local_tensor.storage().data_ptr().get(); + std::lock_guard lock(getPySymmHandleCacheMutex()); + auto& cache = getPySymmHandleCache(); + auto it = cache.find(key); + if (it != cache.end()) { + py_symm_handle_ = std::move(it->second); + cache.erase(it); + world_size_ = py_symm_handle_->get_world_size(); + my_device_id_ = py_symm_handle_->get_rank(); + requested_size_ = local_tensor.numel() * local_tensor.element_size(); + are_remote_tensors_setup_ = true; // PyTorch rendezvous already set up + if (py_symm_handle_->has_multicast_support()) { + is_multicast_setup_ = true; + mc_ptr_ = py_symm_handle_->get_multicast_ptr(); + } + return; + } + } +#endif + std::string error = SymmetricTensor::validate(local_tensor); NVF_CHECK(error.empty(), "Invalid symmetric allocation: ", error); @@ -253,6 +371,11 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) } SymmetricTensor::~SymmetricTensor() { +#ifdef NVFUSER_DISTRIBUTED + if (py_symm_handle_) { + return; // PyTorch backend: no native VMM cleanup + } +#endif #if (CUDA_VERSION >= 13000) if (is_multicast_setup_) { if (mc_base_ptr_) { @@ -302,6 +425,11 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { if (are_remote_tensors_setup_ == true) { return; } +#ifdef NVFUSER_DISTRIBUTED + if (py_symm_handle_) { + return; // PyTorch backend: rendezvous already established remote access + } +#endif Communicator& comm = Communicator::getInstance(); CUmemGenericAllocationHandle local_handle = alloc_handles_[my_device_id_]; CUdeviceptr local_ptr = remote_ptrs_[my_device_id_]; @@ -379,6 +507,13 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { return local_tensor_; } +#ifdef NVFUSER_DISTRIBUTED + if (py_symm_handle_) { + return py_symm_handle_->get_remote_tensor( + rank, local_tensor_.sizes(), local_tensor_.scalar_type()); + } +#endif + NVF_CHECK(are_remote_tensors_setup_ == true, "Remote tensors not setup"); return at::from_blob( reinterpret_cast(remote_ptrs_[rank]), @@ -390,6 +525,13 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { } void* SymmetricTensor::multicastPtr() const { +#ifdef NVFUSER_DISTRIBUTED + if (py_symm_handle_) { + return py_symm_handle_->has_multicast_support() + ? py_symm_handle_->get_multicast_ptr() + : nullptr; + } +#endif NVF_CHECK(is_multicast_setup_, "Multicast not setup"); return mc_ptr_; } @@ -398,7 +540,14 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { if (is_contiguous_view_setup_) { return; } - +#ifdef NVFUSER_DISTRIBUTED + if (py_symm_handle_) { + NVF_ERROR( + false, + "Contiguous view is not yet supported for PyTorch symmetric memory backend. " + "Use native backend for SymmetricContiguousView."); + } +#endif NVF_CHECK( are_remote_tensors_setup_ == true, "Remote tensors must be setup before setupContiguousView"); @@ -462,6 +611,13 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { } at::Tensor SymmetricTensor::getContiguousView() const { +#ifdef NVFUSER_DISTRIBUTED + if (py_symm_handle_) { + NVF_ERROR( + false, + "Contiguous view is not yet supported for PyTorch symmetric memory backend."); + } +#endif NVF_CHECK(is_contiguous_view_setup_, "Contiguous view not setup"); return contiguous_view_; } @@ -469,6 +625,11 @@ at::Tensor SymmetricTensor::getContiguousView() const { void SymmetricTensor::setupMulticast( int64_t exporter_rank, const std::string& tag) { +#ifdef NVFUSER_DISTRIBUTED + if (py_symm_handle_) { + return; // PyTorch backend: multicast handled by backend if supported + } +#endif #if (CUDA_VERSION >= 13000) if (is_multicast_setup_) { return; diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index 5608153e0ce..c928a7d5469 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -10,6 +10,10 @@ #include #include +#ifdef NVFUSER_DISTRIBUTED +#include +#endif + namespace nvfuser { // SymmetricTensor wraps a local symmetric memory allocation and enables: @@ -18,13 +22,14 @@ namespace nvfuser { // - Contiguous view creation across all ranks // // Design: Decouples local allocation from IPC handle exchange for better -// interoperability and support for pre-allocated user buffers +// interoperability and support for pre-allocated user buffers. // -// TODO: Long term plan is to integrate pytorch's native symmetric memory as a -// possible backend. One important reason to use pytorch's allocator is to use -// pytorch's memory pool to let the framework own the memory stack and not -// further fragment the memory. On the other hand, having our own implementation -// allows us to experiment more advanced features like contigous view creation. +// Backends (see SymmetricMemoryBackend in ipc_utils.h): +// - Native (default): Fuser's own CUDA VMM + IPC implementation; maintained. +// - PyTorch (Nccl, Nvshmem, Cuda): Use PyTorch's symmetric memory +// (torch.distributed._symmetric_memory) with the chosen transport backend. +// Select via NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl|pytorch_nvshmem|pytorch_cuda). +// Native remains the default when the option is not set. class SymmetricTensor { public: // Wrap pre-allocated symmetric tensor (must use allocate()) @@ -79,6 +84,10 @@ class SymmetricTensor { int peer_fd_{-1}; bool is_contiguous_view_setup_ = false; at::Tensor contiguous_view_; +#ifdef NVFUSER_DISTRIBUTED + // When set, remote/multicast APIs delegate to PyTorch symmetric memory. + c10::intrusive_ptr py_symm_handle_; +#endif }; } // namespace nvfuser diff --git a/csrc/options.cpp b/csrc/options.cpp index 6d587e35afd..9197b43ee3e 100644 --- a/csrc/options.cpp +++ b/csrc/options.cpp @@ -183,6 +183,7 @@ const std::unordered_map& getEnableOptions() { {"fast_math", EnableOption::FastMath}, {"p2p_protocol", EnableOption::P2pProtocol}, {"multicast_protocol", EnableOption::MulticastProtocol}, + {"symmetric_memory_backend", EnableOption::SymmetricMemoryBackend}, {"parallel_serde", EnableOption::ParallelSerde}, }; return available_options; diff --git a/csrc/options.h b/csrc/options.h index 4c72c757460..bda66b1f526 100644 --- a/csrc/options.h +++ b/csrc/options.h @@ -128,6 +128,8 @@ enum class EnableOption { P2pProtocol, //! Prescribe P2P protocol: put|get MulticastProtocol, //! Prescribe multicast protocol: //! memcpy|multimem|batch_memcpy + SymmetricMemoryBackend, //! Prescribe symmetric memory backend: + //! native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda ParallelSerde, //! Enable deserializing FusionExecutorCache in parallel EndOfOption //! Placeholder for counting the number of elements }; diff --git a/fbuild.sh b/fbuild.sh new file mode 100755 index 00000000000..e16a2e2cdd9 --- /dev/null +++ b/fbuild.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +export CC=clang-20 +export CXX=clang++-20 +export LDFLAGS="-fuse-ld=mold" + +export NVFUSER_BUILD_ENABLE_PCH + +export UCC_HOME="/opt/hpcx/ucc" +export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc" +export UCX_HOME="/opt/hpcx/ucx" +export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx" + +# export TORCH_CUDA_ARCH_LIST="9.0" + +export NVFUSER_BUILD_WITH_UCC=1 +export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser +export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY + +# Enable debug mode, leave empty for non-debug compilation +export NVFUSER_BUILD_BUILD_TYPE=Debug +export RUN_CMAKE="" + +pip install -v -e ./python --no-build-isolation diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp index 2e4b5e66767..98af2bd6fbd 100644 --- a/tests/cpp/test_multidevice_symmetric_tensor.cpp +++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp @@ -5,6 +5,7 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on +#include "multidevice/ipc_utils.h" #include "multidevice/symmetric_tensor.h" #include "tests/cpp/multidevice.h" @@ -12,6 +13,68 @@ namespace nvfuser { using SymmetricTensorTest = MultiDeviceTest; +// ----------------------------------------------------------------------------- +// Symmetric memory backend and option tests +// ----------------------------------------------------------------------------- + +TEST_F(SymmetricTensorTest, GetSymmetricMemoryBackend_ReturnsValidBackend) { + SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); + EXPECT_TRUE( + backend == SymmetricMemoryBackend::Native || + backend == SymmetricMemoryBackend::PyTorchNccl || + backend == SymmetricMemoryBackend::PyTorchNvshmem || + backend == SymmetricMemoryBackend::PyTorchCuda) + << "getSymmetricMemoryBackend() returned an invalid backend value"; +} + +TEST_F(SymmetricTensorTest, Validate_RejectsNormalCudaTensor) { + if (communicator_->size() == 1) { + GTEST_SKIP() << "Skipping test for single device (Native allocate needs VMM)"; + } + // With Native backend, allocate() uses VMM; with PyTorch backend we skip + // because validate() is only used for Native path (PyTorch tensors come from + // cache in constructor). + if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { + GTEST_SKIP() << "Validate test applies to Native backend only"; + } + // Allocate a normal (non-symmetric) CUDA tensor + at::Tensor normal_tensor = at::empty( + {64, 64}, + at::TensorOptions().dtype(at::kFloat).device(communicator_->device())); + std::string error = SymmetricTensor::validate(normal_tensor); + EXPECT_FALSE(error.empty()) + << "SymmetricTensor::validate() should reject a normal CUDA tensor"; +} + +TEST_F(SymmetricTensorTest, Validate_AcceptsSymmetricAllocation) { + if (communicator_->size() == 1) { + GTEST_SKIP() << "Skipping test for single device"; + } + if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { + GTEST_SKIP() << "validate() for allocate() output is defined for Native backend only"; + } + at::Tensor sym_tensor = SymmetricTensor::allocate( + {128, 128}, at::ScalarType::Float, communicator_->device()); + std::string error = SymmetricTensor::validate(sym_tensor); + EXPECT_TRUE(error.empty()) + << "SymmetricTensor::validate() should accept tensor from allocate(); got: " + << error; +} + +TEST_F(SymmetricTensorTest, Constructor_ThrowsOnInvalidTensor) { + at::Tensor normal_tensor = at::empty( + {8, 8}, + at::TensorOptions().dtype(at::kFloat).device(communicator_->device())); + EXPECT_THROW( + { SymmetricTensor sym_tensor(normal_tensor); }, + c10::Error) + << "SymmetricTensor constructor should throw when given a non-symmetric tensor"; +} + +// ----------------------------------------------------------------------------- +// Backend-agnostic and Native backend correctness (allocate + remote access) +// ----------------------------------------------------------------------------- + TEST_F(SymmetricTensorTest, BasicAllocation) { if (communicator_->size() == 1) { GTEST_SKIP() << "Skipping test for single device"; @@ -54,6 +117,51 @@ TEST_F(SymmetricTensorTest, BasicAllocation) { } } +// Same remote-access correctness as BasicAllocation but only runs when +// PyTorch symmetric memory backend is selected (NVFUSER_ENABLE= +// symmetric_memory_backend(pytorch_nccl|pytorch_nvshmem|pytorch_cuda)). +// Run with e.g. NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl) to +// exercise the PyTorch path. +TEST_F(SymmetricTensorTest, PyTorchBackend_RemoteAccessCorrectness) { + if (communicator_->size() == 1) { + GTEST_SKIP() << "Skipping test for single device"; + } + SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); + if (backend == SymmetricMemoryBackend::Native) { + GTEST_SKIP() + << "PyTorch backend not selected; set NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl) to run"; + } + + const int64_t rank = communicator_->deviceId(); + const int64_t world_size = communicator_->size(); + + at::Tensor local_tensor = SymmetricTensor::allocate( + {256, 512}, at::ScalarType::Float, communicator_->device()); + SymmetricTensor sym_tensor(local_tensor); + + EXPECT_TRUE(local_tensor.is_cuda()); + EXPECT_EQ(local_tensor.numel(), 256 * 512); + + float local_value = static_cast(rank + 200); + local_tensor.fill_(local_value); + + sym_tensor.setupRemoteHandles(); + + for (int64_t peer_rank = 0; peer_rank < world_size; ++peer_rank) { + void* peer_ptr = sym_tensor.remoteTensor(peer_rank).data_ptr(); + EXPECT_NE(peer_ptr, nullptr); + + float peer_value; + NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy( + &peer_value, peer_ptr, sizeof(float), cudaMemcpyDeviceToHost)); + + float expected_value = static_cast(peer_rank + 200); + EXPECT_FLOAT_EQ(peer_value, expected_value) + << "Rank " << rank << " reading from rank " << peer_rank + << " (PyTorch backend)"; + } +} + TEST_F(SymmetricTensorTest, PreallocatedTensor) { if (communicator_->size() == 1) { GTEST_SKIP() << "Skipping test for single device"; From 49d669c3b237a1b78949371f3f9133c6bc9167ec Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Mon, 9 Mar 2026 11:19:58 +0200 Subject: [PATCH 02/16] Initial review comments --- csrc/multidevice/communicator.cpp | 1 + csrc/multidevice/symmetric_tensor.cpp | 71 +++++++------------ .../cpp/test_multidevice_symmetric_tensor.cpp | 48 ------------- 3 files changed, 25 insertions(+), 95 deletions(-) diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 7531f7ff186..85fc15ae3a5 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -441,3 +441,4 @@ c10::intrusive_ptr Communicator::getWorldBackendIntrusivePtr( #endif } // namespace nvfuser + diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index 6f8c3330515..b3e1ca49c8e 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -7,7 +7,6 @@ // clang-format on #include "multidevice/symmetric_tensor.h" -#include #include #include "cuda_utils.h" @@ -18,6 +17,7 @@ #ifdef NVFUSER_DISTRIBUTED #include +#include #endif namespace nvfuser { @@ -27,22 +27,6 @@ namespace { #ifdef NVFUSER_DISTRIBUTED const char* kPyTorchSymmMemGroupName = "nvfuser_symm"; -// Cache: tensor storage data ptr -> PyTorch SymmetricMemory handle from -// rendezvous. Used so SymmetricTensor(tensor) can recover the handle. -std::unordered_map>& -getPySymmHandleCache() { - static std::unordered_map< - void*, - c10::intrusive_ptr> - cache; - return cache; -} - -std::mutex& getPySymmHandleCacheMutex() { - static std::mutex m; - return m; -} - void ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { static std::once_flag once; std::call_once(once, [backend]() { @@ -68,6 +52,9 @@ void ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { static_cast(comm.deviceId()), static_cast(comm.size()), comm.getStore()); + // c10d::register_process_group( + // kPyTorchSymmMemGroupName, + // comm.getWorldBackendIntrusivePtr(CommunicatorBackend::kNccl)); }); } #endif @@ -157,22 +144,13 @@ at::Tensor SymmetricTensor::allocate( (backend == SymmetricMemoryBackend::PyTorchNccl) ? c10::nullopt : c10::optional(kPyTorchSymmMemGroupName); - at::Tensor tensor = c10d::symmetric_memory::empty_strided_p2p( + return c10d::symmetric_memory::empty_strided_p2p( sizes, strides, dtype, device, alloc_group_name, c10::nullopt); - c10::intrusive_ptr handle = - c10d::symmetric_memory::rendezvous( - tensor, c10::optional(kPyTorchSymmMemGroupName)); - void* key = tensor.storage().data_ptr().get(); - { - std::lock_guard lock(getPySymmHandleCacheMutex()); - getPySymmHandleCache()[key] = handle; - } - return tensor; } #else if (backend != SymmetricMemoryBackend::Native) { @@ -309,24 +287,14 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) local_tensor.device()); #ifdef NVFUSER_DISTRIBUTED - { - void* key = local_tensor.storage().data_ptr().get(); - std::lock_guard lock(getPySymmHandleCacheMutex()); - auto& cache = getPySymmHandleCache(); - auto it = cache.find(key); - if (it != cache.end()) { - py_symm_handle_ = std::move(it->second); - cache.erase(it); - world_size_ = py_symm_handle_->get_world_size(); - my_device_id_ = py_symm_handle_->get_rank(); - requested_size_ = local_tensor.numel() * local_tensor.element_size(); - are_remote_tensors_setup_ = true; // PyTorch rendezvous already set up - if (py_symm_handle_->has_multicast_support()) { - is_multicast_setup_ = true; - mc_ptr_ = py_symm_handle_->get_multicast_ptr(); - } - return; - } + SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); + if (backend != SymmetricMemoryBackend::Native) { + ensurePyTorchSymmMemBackend(backend); + Communicator& comm = Communicator::getInstance(); + world_size_ = comm.size(); + my_device_id_ = comm.deviceId(); + requested_size_ = local_tensor.numel() * local_tensor.element_size(); + return; // Rendezvous runs in setupRemoteHandles() } #endif @@ -426,8 +394,17 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { return; } #ifdef NVFUSER_DISTRIBUTED - if (py_symm_handle_) { - return; // PyTorch backend: rendezvous already established remote access + // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles). + if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { + ensurePyTorchSymmMemBackend(getSymmetricMemoryBackend()); + py_symm_handle_ = c10d::symmetric_memory::rendezvous( + local_tensor_, c10::optional(kPyTorchSymmMemGroupName)); + are_remote_tensors_setup_ = true; + if (py_symm_handle_->has_multicast_support()) { + is_multicast_setup_ = true; + mc_ptr_ = py_symm_handle_->get_multicast_ptr(); + } + return; } #endif Communicator& comm = Communicator::getInstance(); diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp index afb895ca938..19e213e6ab9 100644 --- a/tests/cpp/test_multidevice_symmetric_tensor.cpp +++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp @@ -27,54 +27,6 @@ TEST_F(SymmetricTensorTest, GetSymmetricMemoryBackend_ReturnsValidBackend) { << "getSymmetricMemoryBackend() returned an invalid backend value"; } -TEST_F(SymmetricTensorTest, Validate_RejectsNormalCudaTensor) { - if (communicator_->size() == 1) { - GTEST_SKIP() << "Skipping test for single device (Native allocate needs VMM)"; - } - // With Native backend, allocate() uses VMM; with PyTorch backend we skip - // because validate() is only used for Native path (PyTorch tensors come from - // cache in constructor). - if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { - GTEST_SKIP() << "Validate test applies to Native backend only"; - } - // Allocate a normal (non-symmetric) CUDA tensor - at::Tensor normal_tensor = at::empty( - {64, 64}, - at::TensorOptions().dtype(at::kFloat).device(communicator_->device())); - std::string error = SymmetricTensor::validate(normal_tensor); - EXPECT_FALSE(error.empty()) - << "SymmetricTensor::validate() should reject a normal CUDA tensor"; -} - -TEST_F(SymmetricTensorTest, Validate_AcceptsSymmetricAllocation) { - if (communicator_->size() == 1) { - GTEST_SKIP() << "Skipping test for single device"; - } - if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { - GTEST_SKIP() << "validate() for allocate() output is defined for Native backend only"; - } - at::Tensor sym_tensor = SymmetricTensor::allocate( - {128, 128}, at::ScalarType::Float, communicator_->device()); - std::string error = SymmetricTensor::validate(sym_tensor); - EXPECT_TRUE(error.empty()) - << "SymmetricTensor::validate() should accept tensor from allocate(); got: " - << error; -} - -TEST_F(SymmetricTensorTest, Constructor_ThrowsOnInvalidTensor) { - at::Tensor normal_tensor = at::empty( - {8, 8}, - at::TensorOptions().dtype(at::kFloat).device(communicator_->device())); - EXPECT_THROW( - { SymmetricTensor sym_tensor(normal_tensor); }, - c10::Error) - << "SymmetricTensor constructor should throw when given a non-symmetric tensor"; -} - -// ----------------------------------------------------------------------------- -// Backend-agnostic and Native backend correctness (allocate + remote access) -// ----------------------------------------------------------------------------- - TEST_F(SymmetricTensorTest, BasicAllocation) { if (communicator_->size() == 1) { GTEST_SKIP() << "Skipping test for single device"; From 8962475d15c45baec352b30f820d9c47955b9b88 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Mon, 16 Mar 2026 13:55:37 +0200 Subject: [PATCH 03/16] Alloc, rendezvous passing --- csrc/multidevice/communicator.cpp | 55 +++++-- csrc/multidevice/communicator.h | 35 +++-- csrc/multidevice/symmetric_tensor.cpp | 144 +++++++++++------- csrc/multidevice/symmetric_tensor.h | 5 +- .../cpp/test_multidevice_symmetric_tensor.cpp | 76 +++------ 5 files changed, 177 insertions(+), 138 deletions(-) diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 85fc15ae3a5..b0642dc3f7c 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -14,7 +14,13 @@ #include #ifdef NVFUSER_DISTRIBUTED +#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP +#include +#endif #include +#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP +#include +#endif #include #ifdef USE_C10D_NCCL #include @@ -362,6 +368,12 @@ void Communicator::cleanup() { pg_nccl->shutdown(); } } +#endif +#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP + for (const auto& entry : process_groups_) { + c10d::unregister_process_group(entry.first); + } + process_groups_.clear(); #endif backends_.clear(); } @@ -402,6 +414,28 @@ c10d::Backend* Communicator::getBackendForTeam( }(); #else backends_[team_key] = nullptr; +#endif +#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP + std::optional pg_backend = + (b == CommunicatorBackend::kNccl) + ? std::optional( + c10d::ProcessGroup::BackendType::NCCL) + : std::nullopt; + if (backends_[team_key] != nullptr && pg_backend.has_value()) { + auto rank_it = std::find(team.begin(), team.end(), deviceId()); + RankType team_rank = std::distance(team.begin(), rank_it); + + auto pg = c10::make_intrusive( + c10::make_intrusive(team_key, store_), + team_rank, + static_cast(team.size())); + pg->setBackend(c10::DeviceType::CUDA, *pg_backend, backends_[team_key]); + pg->setDefaultBackend(*pg_backend); + pg->setGroupName(team_key); + + c10d::register_process_group(team_key, pg); + process_groups_[team_key] = std::move(pg); + } #endif } return backends_.at(team_key).get(); @@ -424,21 +458,14 @@ void Communicator::barrier(std::optional backend) { getWorld(backend)->barrier(options)->wait(); } -#ifdef NVFUSER_DISTRIBUTED -c10::intrusive_ptr Communicator::getStore() const { - return c10::intrusive_ptr(store_); -} - -c10::intrusive_ptr Communicator::getWorldBackendIntrusivePtr( - std::optional backend) { - std::vector all_ranks(size_); - std::iota(all_ranks.begin(), all_ranks.end(), 0); - CommunicatorBackend b = backend.value_or(default_backend_); - std::string team_key = getTeamKey(all_ranks, b); - (void)getBackendForTeam(all_ranks, backend, ""); - return backends_.at(team_key); +std::string Communicator::getSymmMemGroupKey( + std::optional backend) { +std::vector all_ranks(size_); +std::iota(all_ranks.begin(), all_ranks.end(), 0); +CommunicatorBackend b = backend.value_or(default_backend_); +(void)getBackendForTeam(all_ranks, b, "symm_mem_"); +return getTeamKey(all_ranks, b); } -#endif } // namespace nvfuser diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 45a7f9e4f4f..57d3d48a750 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -11,9 +11,19 @@ #include #include +#if defined(NVFUSER_DISTRIBUTED) && \ + __has_include() && \ + __has_include() +#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 1 +#else +#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 0 +#endif + #ifdef NVFUSER_DISTRIBUTED #include -#include +#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP +#include +#endif #include #include #else @@ -111,6 +121,10 @@ class NVF_API Communicator { c10d::Backend* getWorld( std::optional backend = std::nullopt); + // Returns the world process-group name for the given backend. + std::string getSymmMemGroupKey( + std::optional backend = std::nullopt); + // returns if a backend is available for creation bool isBackendAvailable(CommunicatorBackend backend) const { if (backend == CommunicatorBackend::kUcc) { @@ -125,17 +139,9 @@ class NVF_API Communicator { return store_.get(); } -#ifdef NVFUSER_DISTRIBUTED - // Returns the store as an intrusive_ptr for use with PyTorch symmetric - // memory (c10d::symmetric_memory::set_group_info). - c10::intrusive_ptr getStore() const; - - // Returns the world backend as an intrusive_ptr so it can be registered with - // c10d::register_process_group (e.g. for PyTorch symmetric memory NCCL - // rendezvous, which resolves the group by name). - c10::intrusive_ptr getWorldBackendIntrusivePtr( - std::optional backend = std::nullopt); -#endif + c10::intrusive_ptr getStore() const { + return c10::intrusive_ptr(store_); + } private: Communicator( @@ -166,6 +172,11 @@ class NVF_API Communicator { c10::intrusive_ptr store_; // cache for the created backends. The keys are strings generated from Teams std::unordered_map> backends_; +#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP + // c10d process-group wrappers registered for symmetric-memory rendezvous. + std::unordered_map> + process_groups_; +#endif }; } // namespace nvfuser diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index b3e1ca49c8e..7ca46e7a290 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -25,39 +25,66 @@ namespace nvfuser { namespace { #ifdef NVFUSER_DISTRIBUTED -const char* kPyTorchSymmMemGroupName = "nvfuser_symm"; - -void ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { - static std::once_flag once; - std::call_once(once, [backend]() { - const char* name = nullptr; - switch (backend) { - case SymmetricMemoryBackend::PyTorchNccl: - name = "NCCL"; - break; - case SymmetricMemoryBackend::PyTorchNvshmem: - name = "NVSHMEM"; - break; - case SymmetricMemoryBackend::PyTorchCuda: - name = "CUDA"; - break; - default: - NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend"); - } - c10d::symmetric_memory::set_backend(name); + std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { + static std::once_flag once; + std::call_once(once, [backend]() { + const char* name = nullptr; + switch (backend) { + case SymmetricMemoryBackend::PyTorchNccl: + name = "NCCL"; + break; + case SymmetricMemoryBackend::PyTorchNvshmem: + name = "NVSHMEM"; + break; + case SymmetricMemoryBackend::PyTorchCuda: + name = "CUDA"; + break; + default: + NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend"); + } + c10d::symmetric_memory::set_backend(name); + }); + Communicator& comm = Communicator::getInstance(); NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory"); - c10d::symmetric_memory::set_group_info( - kPyTorchSymmMemGroupName, - static_cast(comm.deviceId()), - static_cast(comm.size()), - comm.getStore()); - // c10d::register_process_group( - // kPyTorchSymmMemGroupName, - // comm.getWorldBackendIntrusivePtr(CommunicatorBackend::kNccl)); - }); -} -#endif + // TODO: Remove after nccl version update + auto maybe_set_group_info = [&](const std::string& group_name) { + try { + c10d::symmetric_memory::set_group_info( + group_name, + static_cast(comm.deviceId()), + static_cast(comm.size()), + comm.getStore()); + } catch (const c10::Error&) { + // already registered + } + }; + // Always return a valid group name + if (backend == SymmetricMemoryBackend::PyTorchNccl) { + NVF_CHECK( + comm.isBackendAvailable(CommunicatorBackend::kNccl), + "NCCL backend is required for symmetric_memory_backend(nccl)"); + + const std::string group_name = comm.getSymmMemGroupKey(CommunicatorBackend::kNccl); + + // This build expects default group name "0" to be registered. + // TODO: Remove after nccl version update + static std::once_flag group_once; + std::call_once(group_once, [&]() { + maybe_set_group_info("0"); + maybe_set_group_info(group_name); + }); + + comm.barrier(CommunicatorBackend::kNccl); + return group_name; + } + + NVF_ERROR( + false, + "No c10d backend available for symmetric memory rendezvous. " + "Expected NCCL or UCC process group."); + } +#endif // Returns the allocation granularity for symmetric memory. @@ -133,7 +160,7 @@ at::Tensor SymmetricTensor::allocate( #ifdef NVFUSER_DISTRIBUTED if (backend != SymmetricMemoryBackend::Native) { - ensurePyTorchSymmMemBackend(backend); + const std::string group_name = ensurePyTorchSymmMemBackend(backend); std::vector strides(sizes.size()); strides.back() = 1; for (int64_t i = (int64_t)strides.size() - 2; i >= 0; --i) { @@ -143,7 +170,7 @@ at::Tensor SymmetricTensor::allocate( c10::optional alloc_group_name = (backend == SymmetricMemoryBackend::PyTorchNccl) ? c10::nullopt - : c10::optional(kPyTorchSymmMemGroupName); + : c10::optional(group_name); return c10d::symmetric_memory::empty_strided_p2p( sizes, strides, @@ -289,12 +316,11 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) #ifdef NVFUSER_DISTRIBUTED SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); if (backend != SymmetricMemoryBackend::Native) { - ensurePyTorchSymmMemBackend(backend); Communicator& comm = Communicator::getInstance(); world_size_ = comm.size(); my_device_id_ = comm.deviceId(); requested_size_ = local_tensor.numel() * local_tensor.element_size(); - return; // Rendezvous runs in setupRemoteHandles() + return; } #endif @@ -340,7 +366,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) SymmetricTensor::~SymmetricTensor() { #ifdef NVFUSER_DISTRIBUTED - if (py_symm_handle_) { + if (torch_symm_handle_) { return; // PyTorch backend: no native VMM cleanup } #endif @@ -395,14 +421,15 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { } #ifdef NVFUSER_DISTRIBUTED // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles). - if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { - ensurePyTorchSymmMemBackend(getSymmetricMemoryBackend()); - py_symm_handle_ = c10d::symmetric_memory::rendezvous( - local_tensor_, c10::optional(kPyTorchSymmMemGroupName)); + SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); + if (backend != SymmetricMemoryBackend::Native) { + const std::string group_name = ensurePyTorchSymmMemBackend(backend); + torch_symm_handle_ = c10d::symmetric_memory::rendezvous( + local_tensor_, group_name); are_remote_tensors_setup_ = true; - if (py_symm_handle_->has_multicast_support()) { + if (torch_symm_handle_->has_multicast_support()) { is_multicast_setup_ = true; - mc_ptr_ = py_symm_handle_->get_multicast_ptr(); + mc_ptr_ = torch_symm_handle_->get_multicast_ptr(); } return; } @@ -485,9 +512,24 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { } #ifdef NVFUSER_DISTRIBUTED - if (py_symm_handle_) { - return py_symm_handle_->get_remote_tensor( - rank, local_tensor_.sizes(), local_tensor_.scalar_type()); + if (torch_symm_handle_) { + return torch_symm_handle_->get_remote_tensor( + rank, local_tensor_.sizes(), local_tensor_.scalar_type()); + // Tried below code for older NCCL build, but it was not working. + // const int64_t storage_offset = local_tensor_.storage_offset(); + // auto* handle = torch_symm_handle_.get(); + // auto peer_ptr = handle->get_buffer_ptrs(); + // NVF_CHECK( + // peer_ptr != nullptr, + // "Cannot get buffer across nodes, my rank: ", + // handle->get_rank(), + // ", peer: ", + // rank); + // std::cout << "symm_mem type: " << typeid(*handle).name() << std::endl; + // const int ws = handle->get_world_size(); + // NVF_CHECK(ws > 0, "SymmetricMemory world_size is 0"); + // return handle->get_buffer( + // rank, local_tensor_.sizes(), local_tensor_.scalar_type(), storage_offset); } #endif @@ -503,9 +545,9 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { void* SymmetricTensor::multicastPtr() const { #ifdef NVFUSER_DISTRIBUTED - if (py_symm_handle_) { - return py_symm_handle_->has_multicast_support() - ? py_symm_handle_->get_multicast_ptr() + if (torch_symm_handle_) { + return torch_symm_handle_->has_multicast_support() + ? torch_symm_handle_->get_multicast_ptr() : nullptr; } #endif @@ -518,7 +560,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { return; } #ifdef NVFUSER_DISTRIBUTED - if (py_symm_handle_) { + if (torch_symm_handle_) { NVF_ERROR( false, "Contiguous view is not yet supported for PyTorch symmetric memory backend. " @@ -589,7 +631,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { at::Tensor SymmetricTensor::getContiguousView() const { #ifdef NVFUSER_DISTRIBUTED - if (py_symm_handle_) { + if (torch_symm_handle_) { NVF_ERROR( false, "Contiguous view is not yet supported for PyTorch symmetric memory backend."); @@ -603,7 +645,7 @@ void SymmetricTensor::setupMulticast( int64_t exporter_rank, const std::string& tag) { #ifdef NVFUSER_DISTRIBUTED - if (py_symm_handle_) { + if (torch_symm_handle_) { return; // PyTorch backend: multicast handled by backend if supported } #endif diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index c928a7d5469..eedf3602846 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -28,7 +28,7 @@ namespace nvfuser { // - Native (default): Fuser's own CUDA VMM + IPC implementation; maintained. // - PyTorch (Nccl, Nvshmem, Cuda): Use PyTorch's symmetric memory // (torch.distributed._symmetric_memory) with the chosen transport backend. -// Select via NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl|pytorch_nvshmem|pytorch_cuda). +// Select via NVFUSER_ENABLE=symmetric_memory_backend(native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda). // Native remains the default when the option is not set. class SymmetricTensor { public: @@ -85,8 +85,7 @@ class SymmetricTensor { bool is_contiguous_view_setup_ = false; at::Tensor contiguous_view_; #ifdef NVFUSER_DISTRIBUTED - // When set, remote/multicast APIs delegate to PyTorch symmetric memory. - c10::intrusive_ptr py_symm_handle_; + c10::intrusive_ptr torch_symm_handle_; #endif }; diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp index 19e213e6ab9..397189541ad 100644 --- a/tests/cpp/test_multidevice_symmetric_tensor.cpp +++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp @@ -13,20 +13,6 @@ namespace nvfuser { using SymmetricTensorTest = MultiDeviceTest; -// ----------------------------------------------------------------------------- -// Symmetric memory backend and option tests -// ----------------------------------------------------------------------------- - -TEST_F(SymmetricTensorTest, GetSymmetricMemoryBackend_ReturnsValidBackend) { - SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); - EXPECT_TRUE( - backend == SymmetricMemoryBackend::Native || - backend == SymmetricMemoryBackend::PyTorchNccl || - backend == SymmetricMemoryBackend::PyTorchNvshmem || - backend == SymmetricMemoryBackend::PyTorchCuda) - << "getSymmetricMemoryBackend() returned an invalid backend value"; -} - TEST_F(SymmetricTensorTest, BasicAllocation) { if (communicator_->size() == 1) { GTEST_SKIP() << "Skipping test for single device"; @@ -69,50 +55,24 @@ TEST_F(SymmetricTensorTest, BasicAllocation) { } } -// Same remote-access correctness as BasicAllocation but only runs when -// PyTorch symmetric memory backend is selected (NVFUSER_ENABLE= -// symmetric_memory_backend(pytorch_nccl|pytorch_nvshmem|pytorch_cuda)). -// Run with e.g. NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl) to -// exercise the PyTorch path. -// TEST_F(SymmetricTensorTest, PyTorchBackend_RemoteAccessCorrectness) { -// if (communicator_->size() == 1) { -// GTEST_SKIP() << "Skipping test for single device"; -// } -// SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); -// if (backend == SymmetricMemoryBackend::Native) { -// GTEST_SKIP() -// << "PyTorch backend not selected; set NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl) to run"; -// } - -// const int64_t rank = communicator_->deviceId(); -// const int64_t world_size = communicator_->size(); - -// at::Tensor local_tensor = SymmetricTensor::allocate( -// {256, 512}, at::ScalarType::Float, communicator_->device()); -// SymmetricTensor sym_tensor(local_tensor); - -// EXPECT_TRUE(local_tensor.is_cuda()); -// EXPECT_EQ(local_tensor.numel(), 256 * 512); - -// float local_value = static_cast(rank + 200); -// local_tensor.fill_(local_value); - -// sym_tensor.setupRemoteHandles(); - -// for (int64_t peer_rank = 0; peer_rank < world_size; ++peer_rank) { -// void* peer_ptr = sym_tensor.remoteTensor(peer_rank).data_ptr(); -// EXPECT_NE(peer_ptr, nullptr); - -// float peer_value; -// NVFUSER_CUDA_RT_SAFE_CALL(cudaMemcpy( -// &peer_value, peer_ptr, sizeof(float), cudaMemcpyDeviceToHost)); - -// float expected_value = static_cast(peer_rank + 200); -// EXPECT_FLOAT_EQ(peer_value, expected_value) -// << "Rank " << rank << " reading from rank " << peer_rank -// << " (PyTorch backend)"; -// } -// } +TEST_F(SymmetricTensorTest, AllocateOnly) { + if (communicator_->size() == 1) { + GTEST_SKIP() << "Skipping test for single device"; + } + + at::Tensor local_tensor = SymmetricTensor::allocate( + {64, 128}, at::ScalarType::Float, communicator_->device()); + + EXPECT_TRUE(local_tensor.is_cuda()); + EXPECT_EQ(local_tensor.scalar_type(), at::ScalarType::Float); + EXPECT_EQ(local_tensor.numel(), 64 * 128); + EXPECT_EQ(local_tensor.sizes()[0], 64); + EXPECT_EQ(local_tensor.sizes()[1], 128); + + SymmetricTensor sym_tensor(local_tensor); + EXPECT_EQ(sym_tensor.localTensor().numel(), 64 * 128); + EXPECT_EQ(sym_tensor.localTensor().data_ptr(), local_tensor.data_ptr()); +} TEST_F(SymmetricTensorTest, PreallocatedTensor) { if (communicator_->size() == 1) { From 67181c89fdee938d2d597ed8f18cbf4eb073e8c3 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Tue, 17 Mar 2026 17:30:21 +0200 Subject: [PATCH 04/16] multicast pending --- 1 | 25 ++++++++++++ csrc/multidevice/communicator.cpp | 2 +- csrc/multidevice/communicator.h | 4 -- csrc/multidevice/symmetric_tensor.cpp | 58 ++++++--------------------- fbuild.sh | 1 + 5 files changed, 40 insertions(+), 50 deletions(-) create mode 100644 1 diff --git a/1 b/1 new file mode 100644 index 00000000000..de4cad83241 --- /dev/null +++ b/1 @@ -0,0 +1,25 @@ +#!/bin/bash + +export CC=clang-20 +export CXX=clang++-20 +export CUDAHOSTCXX=/usr/bin/clang++-20 +export LDFLAGS="-fuse-ld=mold" + +export NVFUSER_BUILD_ENABLE_PCH + +export UCC_HOME="/opt/hpcx/ucc" +export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc" +export UCX_HOME="/opt/hpcx/ucx" +export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx" + +# export TORCH_CUDA_ARCH_LIST="9.0" + +export NVFUSER_BUILD_WITH_UCC=1 +export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser +export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY + +# Enable debug mode, leave empty for non-debug compilation +export NVFUSER_BUILD_BUILD_TYPE=Debug +export RUN_CMAKE="" + +pip install -v -e ./python --no-build-isolation diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index b0642dc3f7c..c1dccfe734e 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -463,7 +463,7 @@ std::string Communicator::getSymmMemGroupKey( std::vector all_ranks(size_); std::iota(all_ranks.begin(), all_ranks.end(), 0); CommunicatorBackend b = backend.value_or(default_backend_); -(void)getBackendForTeam(all_ranks, b, "symm_mem_"); +(void)getBackendForTeam(all_ranks, b); return getTeamKey(all_ranks, b); } diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 57d3d48a750..58ccc2b153a 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -139,10 +139,6 @@ class NVF_API Communicator { return store_.get(); } - c10::intrusive_ptr getStore() const { - return c10::intrusive_ptr(store_); - } - private: Communicator( CommunicatorBackend backend = comm_backend_default, diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index 7ca46e7a290..3a6cbf1d7a0 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -47,18 +47,7 @@ namespace { Communicator& comm = Communicator::getInstance(); NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory"); - // TODO: Remove after nccl version update - auto maybe_set_group_info = [&](const std::string& group_name) { - try { - c10d::symmetric_memory::set_group_info( - group_name, - static_cast(comm.deviceId()), - static_cast(comm.size()), - comm.getStore()); - } catch (const c10::Error&) { - // already registered - } - }; + // Always return a valid group name if (backend == SymmetricMemoryBackend::PyTorchNccl) { NVF_CHECK( @@ -67,14 +56,6 @@ namespace { const std::string group_name = comm.getSymmMemGroupKey(CommunicatorBackend::kNccl); - // This build expects default group name "0" to be registered. - // TODO: Remove after nccl version update - static std::once_flag group_once; - std::call_once(group_once, [&]() { - maybe_set_group_info("0"); - maybe_set_group_info(group_name); - }); - comm.barrier(CommunicatorBackend::kNccl); return group_name; } @@ -421,17 +402,19 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { } #ifdef NVFUSER_DISTRIBUTED // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles). - SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); - if (backend != SymmetricMemoryBackend::Native) { - const std::string group_name = ensurePyTorchSymmMemBackend(backend); - torch_symm_handle_ = c10d::symmetric_memory::rendezvous( - local_tensor_, group_name); - are_remote_tensors_setup_ = true; - if (torch_symm_handle_->has_multicast_support()) { - is_multicast_setup_ = true; - mc_ptr_ = torch_symm_handle_->get_multicast_ptr(); + if(is_multicast_setup_==false) { + SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); + if (backend != SymmetricMemoryBackend::Native) { + const std::string group_name = ensurePyTorchSymmMemBackend(backend); + torch_symm_handle_ = c10d::symmetric_memory::rendezvous( + local_tensor_, group_name); + are_remote_tensors_setup_ = true; + if (torch_symm_handle_->has_multicast_support()) { + is_multicast_setup_ = true; + mc_ptr_ = torch_symm_handle_->get_multicast_ptr(); + } + return; } - return; } #endif Communicator& comm = Communicator::getInstance(); @@ -515,21 +498,6 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { if (torch_symm_handle_) { return torch_symm_handle_->get_remote_tensor( rank, local_tensor_.sizes(), local_tensor_.scalar_type()); - // Tried below code for older NCCL build, but it was not working. - // const int64_t storage_offset = local_tensor_.storage_offset(); - // auto* handle = torch_symm_handle_.get(); - // auto peer_ptr = handle->get_buffer_ptrs(); - // NVF_CHECK( - // peer_ptr != nullptr, - // "Cannot get buffer across nodes, my rank: ", - // handle->get_rank(), - // ", peer: ", - // rank); - // std::cout << "symm_mem type: " << typeid(*handle).name() << std::endl; - // const int ws = handle->get_world_size(); - // NVF_CHECK(ws > 0, "SymmetricMemory world_size is 0"); - // return handle->get_buffer( - // rank, local_tensor_.sizes(), local_tensor_.scalar_type(), storage_offset); } #endif diff --git a/fbuild.sh b/fbuild.sh index e16a2e2cdd9..de4cad83241 100755 --- a/fbuild.sh +++ b/fbuild.sh @@ -2,6 +2,7 @@ export CC=clang-20 export CXX=clang++-20 +export CUDAHOSTCXX=/usr/bin/clang++-20 export LDFLAGS="-fuse-ld=mold" export NVFUSER_BUILD_ENABLE_PCH From eea57d807228479f6e3129fef5bbaeb0a03524c7 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Fri, 20 Mar 2026 02:11:35 +0200 Subject: [PATCH 05/16] all backends passing --- csrc/multidevice/communicator.cpp | 3 +- csrc/multidevice/symmetric_tensor.cpp | 78 ++++++++++--------- csrc/multidevice/symmetric_tensor.h | 2 +- .../cpp/test_multidevice_symmetric_tensor.cpp | 25 ++---- 4 files changed, 50 insertions(+), 58 deletions(-) diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index c1dccfe734e..ba40835d9da 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -467,5 +467,4 @@ CommunicatorBackend b = backend.value_or(default_backend_); return getTeamKey(all_ranks, b); } -} // namespace nvfuser - +} // namespace nvfuser \ No newline at end of file diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index 3a6cbf1d7a0..b53bfbe2235 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -32,9 +32,11 @@ namespace { switch (backend) { case SymmetricMemoryBackend::PyTorchNccl: name = "NCCL"; + c10d::symmetric_memory::set_backend(name); break; case SymmetricMemoryBackend::PyTorchNvshmem: name = "NVSHMEM"; + c10d::symmetric_memory::set_backend(name); break; case SymmetricMemoryBackend::PyTorchCuda: name = "CUDA"; @@ -42,30 +44,39 @@ namespace { default: NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend"); } - c10d::symmetric_memory::set_backend(name); }); - + Communicator& comm = Communicator::getInstance(); NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory"); // Always return a valid group name - if (backend == SymmetricMemoryBackend::PyTorchNccl) { + if (backend != SymmetricMemoryBackend::Native) { NVF_CHECK( comm.isBackendAvailable(CommunicatorBackend::kNccl), "NCCL backend is required for symmetric_memory_backend(nccl)"); - + const std::string group_name = comm.getSymmMemGroupKey(CommunicatorBackend::kNccl); + static std::once_flag pg0_once; + std::call_once(pg0_once, [&]() { + try { + (void)c10d::resolve_process_group("0"); + } catch (const c10::Error&) { + auto pg = c10d::resolve_process_group(group_name); + c10d::register_process_group("0", pg); + } + }); + comm.barrier(CommunicatorBackend::kNccl); return group_name; } - + NVF_ERROR( false, "No c10d backend available for symmetric memory rendezvous. " "Expected NCCL or UCC process group."); } -#endif +#endif // Returns the allocation granularity for symmetric memory. @@ -149,7 +160,7 @@ at::Tensor SymmetricTensor::allocate( } // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name. c10::optional alloc_group_name = - (backend == SymmetricMemoryBackend::PyTorchNccl) + (backend == SymmetricMemoryBackend::PyTorchNccl || backend == SymmetricMemoryBackend::PyTorchNvshmem) ? c10::nullopt : c10::optional(group_name); return c10d::symmetric_memory::empty_strided_p2p( @@ -158,7 +169,7 @@ at::Tensor SymmetricTensor::allocate( dtype, device, alloc_group_name, - c10::nullopt); + /*alloc_id=*/c10::nullopt); } #else if (backend != SymmetricMemoryBackend::Native) { @@ -226,6 +237,9 @@ at::Tensor SymmetricTensor::allocate( } std::string SymmetricTensor::validate(at::Tensor tensor) { + if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { + return ""; + } int is_vmm_supported; NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute( &is_vmm_supported, @@ -402,19 +416,17 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { } #ifdef NVFUSER_DISTRIBUTED // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles). - if(is_multicast_setup_==false) { - SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); - if (backend != SymmetricMemoryBackend::Native) { - const std::string group_name = ensurePyTorchSymmMemBackend(backend); - torch_symm_handle_ = c10d::symmetric_memory::rendezvous( - local_tensor_, group_name); - are_remote_tensors_setup_ = true; - if (torch_symm_handle_->has_multicast_support()) { - is_multicast_setup_ = true; - mc_ptr_ = torch_symm_handle_->get_multicast_ptr(); - } - return; + SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); + if (backend != SymmetricMemoryBackend::Native) { + const std::string group_name = ensurePyTorchSymmMemBackend(backend); + torch_symm_handle_ = c10d::symmetric_memory::rendezvous( + local_tensor_, group_name); + are_remote_tensors_setup_ = true; + if (torch_symm_handle_->has_multicast_support()) { + is_multicast_setup_ = true; + mc_ptr_ = torch_symm_handle_->get_multicast_ptr(); } + return; } #endif Communicator& comm = Communicator::getInstance(); @@ -512,13 +524,6 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { } void* SymmetricTensor::multicastPtr() const { -#ifdef NVFUSER_DISTRIBUTED - if (torch_symm_handle_) { - return torch_symm_handle_->has_multicast_support() - ? torch_symm_handle_->get_multicast_ptr() - : nullptr; - } -#endif NVF_CHECK(is_multicast_setup_, "Multicast not setup"); return mc_ptr_; } @@ -529,7 +534,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { } #ifdef NVFUSER_DISTRIBUTED if (torch_symm_handle_) { - NVF_ERROR( + NVF_THROW( false, "Contiguous view is not yet supported for PyTorch symmetric memory backend. " "Use native backend for SymmetricContiguousView."); @@ -600,7 +605,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { at::Tensor SymmetricTensor::getContiguousView() const { #ifdef NVFUSER_DISTRIBUTED if (torch_symm_handle_) { - NVF_ERROR( + NVF_THROW( false, "Contiguous view is not yet supported for PyTorch symmetric memory backend."); } @@ -612,16 +617,19 @@ at::Tensor SymmetricTensor::getContiguousView() const { void SymmetricTensor::setupMulticast( int64_t exporter_rank, const std::string& tag) { -#ifdef NVFUSER_DISTRIBUTED - if (torch_symm_handle_) { - return; // PyTorch backend: multicast handled by backend if supported - } -#endif #if (CUDA_VERSION >= 13000) if (is_multicast_setup_) { return; } - +#ifdef NVFUSER_DISTRIBUTED + if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { + if (!torch_symm_handle_) { + setupRemoteHandles(tag); + NVF_CHECK(torch_symm_handle_->has_multicast_support(), "Multicast not supported"); + } + return; + } +#endif Communicator& comm = Communicator::getInstance(); const int64_t my_rank = comm.deviceId(); const int64_t local_rank = comm.local_rank(); diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index eedf3602846..20f4d63ccda 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -25,7 +25,7 @@ namespace nvfuser { // interoperability and support for pre-allocated user buffers. // // Backends (see SymmetricMemoryBackend in ipc_utils.h): -// - Native (default): Fuser's own CUDA VMM + IPC implementation; maintained. +// - Native (default): Fuser's own CUDA VMM + IPC implementation // - PyTorch (Nccl, Nvshmem, Cuda): Use PyTorch's symmetric memory // (torch.distributed._symmetric_memory) with the chosen transport backend. // Select via NVFUSER_ENABLE=symmetric_memory_backend(native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda). diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp index 397189541ad..020ea0b8745 100644 --- a/tests/cpp/test_multidevice_symmetric_tensor.cpp +++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp @@ -5,9 +5,9 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on -#include "multidevice/ipc_utils.h" #include "multidevice/symmetric_tensor.h" #include "tests/cpp/multidevice.h" +#include "multidevice/ipc_utils.h" namespace nvfuser { @@ -55,25 +55,6 @@ TEST_F(SymmetricTensorTest, BasicAllocation) { } } -TEST_F(SymmetricTensorTest, AllocateOnly) { - if (communicator_->size() == 1) { - GTEST_SKIP() << "Skipping test for single device"; - } - - at::Tensor local_tensor = SymmetricTensor::allocate( - {64, 128}, at::ScalarType::Float, communicator_->device()); - - EXPECT_TRUE(local_tensor.is_cuda()); - EXPECT_EQ(local_tensor.scalar_type(), at::ScalarType::Float); - EXPECT_EQ(local_tensor.numel(), 64 * 128); - EXPECT_EQ(local_tensor.sizes()[0], 64); - EXPECT_EQ(local_tensor.sizes()[1], 128); - - SymmetricTensor sym_tensor(local_tensor); - EXPECT_EQ(sym_tensor.localTensor().numel(), 64 * 128); - EXPECT_EQ(sym_tensor.localTensor().data_ptr(), local_tensor.data_ptr()); -} - TEST_F(SymmetricTensorTest, PreallocatedTensor) { if (communicator_->size() == 1) { GTEST_SKIP() << "Skipping test for single device"; @@ -188,6 +169,9 @@ TEST_F(SymmetricTensorTest, ContiguousView) { if (communicator_->size() == 1) { GTEST_SKIP() << "Skipping test for single device"; } + if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { + GTEST_SKIP() << "Skipping test for Pytorch symmetric memory backend"; + } const int64_t rank = communicator_->deviceId(); const int64_t world_size = communicator_->size(); @@ -252,6 +236,7 @@ TEST_F(SymmetricTensorTest, SmallAllocation) { if (communicator_->size() == 1) { GTEST_SKIP() << "Skipping test for single device"; } + std::cout << "Vishal chishta" << std::endl; const int64_t rank = communicator_->deviceId(); const int64_t world_size = communicator_->size(); From a9ddffdd3216b6c6d6919c51d5fc197e15ccaff2 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Fri, 20 Mar 2026 22:24:39 +0200 Subject: [PATCH 06/16] delete build file --- fbuild.sh | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100755 fbuild.sh diff --git a/fbuild.sh b/fbuild.sh deleted file mode 100755 index de4cad83241..00000000000 --- a/fbuild.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -export CC=clang-20 -export CXX=clang++-20 -export CUDAHOSTCXX=/usr/bin/clang++-20 -export LDFLAGS="-fuse-ld=mold" - -export NVFUSER_BUILD_ENABLE_PCH - -export UCC_HOME="/opt/hpcx/ucc" -export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc" -export UCX_HOME="/opt/hpcx/ucx" -export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx" - -# export TORCH_CUDA_ARCH_LIST="9.0" - -export NVFUSER_BUILD_WITH_UCC=1 -export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser -export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY - -# Enable debug mode, leave empty for non-debug compilation -export NVFUSER_BUILD_BUILD_TYPE=Debug -export RUN_CMAKE="" - -pip install -v -e ./python --no-build-isolation From 8e62ccce29799b9b137685b1d34897f72af56343 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Tue, 24 Mar 2026 12:48:23 +0200 Subject: [PATCH 07/16] Lint errors and review comments --- 1 | 25 ---- csrc/multidevice/communicator.cpp | 14 +- csrc/multidevice/communicator.h | 2 +- csrc/multidevice/ipc_utils.h | 5 +- csrc/multidevice/symmetric_tensor.cpp | 126 ++++++++++-------- csrc/multidevice/symmetric_tensor.h | 6 +- .../cpp/test_multidevice_symmetric_tensor.cpp | 3 +- 7 files changed, 83 insertions(+), 98 deletions(-) delete mode 100644 1 diff --git a/1 b/1 deleted file mode 100644 index de4cad83241..00000000000 --- a/1 +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -export CC=clang-20 -export CXX=clang++-20 -export CUDAHOSTCXX=/usr/bin/clang++-20 -export LDFLAGS="-fuse-ld=mold" - -export NVFUSER_BUILD_ENABLE_PCH - -export UCC_HOME="/opt/hpcx/ucc" -export UCC_DIR="/opt/hpcx/ucc/lib/cmake/ucc" -export UCX_HOME="/opt/hpcx/ucx" -export UCX_DIR="/opt/hpcx/ucx/lib/cmake/ucx" - -# export TORCH_CUDA_ARCH_LIST="9.0" - -export NVFUSER_BUILD_WITH_UCC=1 -export NVFUSER_BUILD_INSTALL_DIR=$BUILD_DIRECTORY/nvfuser -export NVFUSER_BUILD_DIR=$BUILD_DIRECTORY - -# Enable debug mode, leave empty for non-debug compilation -export NVFUSER_BUILD_BUILD_TYPE=Debug -export RUN_CMAKE="" - -pip install -v -e ./python --no-build-isolation diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index ba40835d9da..30bed276b00 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -417,7 +417,7 @@ c10d::Backend* Communicator::getBackendForTeam( #endif #if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP std::optional pg_backend = - (b == CommunicatorBackend::kNccl) + (b == CommunicatorBackend::kNccl) ? std::optional( c10d::ProcessGroup::BackendType::NCCL) : std::nullopt; @@ -459,12 +459,12 @@ void Communicator::barrier(std::optional backend) { } std::string Communicator::getSymmMemGroupKey( - std::optional backend) { -std::vector all_ranks(size_); -std::iota(all_ranks.begin(), all_ranks.end(), 0); -CommunicatorBackend b = backend.value_or(default_backend_); -(void)getBackendForTeam(all_ranks, b); -return getTeamKey(all_ranks, b); + std::optional backend) { + std::vector all_ranks(size_); + std::iota(all_ranks.begin(), all_ranks.end(), 0); + CommunicatorBackend b = backend.value_or(default_backend_); + (void)getBackendForTeam(all_ranks, b); + return getTeamKey(all_ranks, b); } } // namespace nvfuser \ No newline at end of file diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 58ccc2b153a..e7efc8dbadf 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -123,7 +123,7 @@ class NVF_API Communicator { // Returns the world process-group name for the given backend. std::string getSymmMemGroupKey( - std::optional backend = std::nullopt); + std::optional backend = std::nullopt); // returns if a backend is available for creation bool isBackendAvailable(CommunicatorBackend backend) const { diff --git a/csrc/multidevice/ipc_utils.h b/csrc/multidevice/ipc_utils.h index 0cfd6586e47..9346ad69e6f 100644 --- a/csrc/multidevice/ipc_utils.h +++ b/csrc/multidevice/ipc_utils.h @@ -35,8 +35,9 @@ MulticastProtocol getMulticastProtocol(); // Backend for symmetric memory allocation and rendezvous. // Native: Fuser's own CUDA VMM + IPC implementation (default, maintained). -// PyTorch*: Use PyTorch's symmetric memory (torch.distributed._symmetric_memory) -// with the given transport backend (Nccl, Nvshmem, or Cuda). +// PyTorch*: Use PyTorch's symmetric memory +// (torch.distributed._symmetric_memory) with the given transport backend (Nccl, +// Nvshmem, or Cuda). enum class SymmetricMemoryBackend { Native, PyTorchNccl, diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index b53bfbe2235..acefdca779e 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -16,8 +16,8 @@ #include "multidevice/utils.h" #ifdef NVFUSER_DISTRIBUTED -#include #include +#include #endif namespace nvfuser { @@ -25,57 +25,60 @@ namespace nvfuser { namespace { #ifdef NVFUSER_DISTRIBUTED - std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { - static std::once_flag once; - std::call_once(once, [backend]() { - const char* name = nullptr; - switch (backend) { - case SymmetricMemoryBackend::PyTorchNccl: - name = "NCCL"; - c10d::symmetric_memory::set_backend(name); - break; - case SymmetricMemoryBackend::PyTorchNvshmem: - name = "NVSHMEM"; - c10d::symmetric_memory::set_backend(name); - break; - case SymmetricMemoryBackend::PyTorchCuda: - name = "CUDA"; - break; - default: - NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend"); - } - }); - - Communicator& comm = Communicator::getInstance(); - NVF_CHECK(comm.is_available(), "Communicator not available for symmetric memory"); +std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { + static std::once_flag once; + std::call_once(once, [backend]() { + const char* name = nullptr; + switch (backend) { + case SymmetricMemoryBackend::PyTorchNccl: + name = "NCCL"; + c10d::symmetric_memory::set_backend(name); + break; + case SymmetricMemoryBackend::PyTorchNvshmem: + name = "NVSHMEM"; + c10d::symmetric_memory::set_backend(name); + break; + case SymmetricMemoryBackend::PyTorchCuda: + name = "CUDA"; + // set_backend(name) is not required for CUDA backend + break; + default: + NVF_ERROR(false, "Unexpected PyTorch symmetric memory backend"); + } + }); - // Always return a valid group name - if (backend != SymmetricMemoryBackend::Native) { - NVF_CHECK( - comm.isBackendAvailable(CommunicatorBackend::kNccl), - "NCCL backend is required for symmetric_memory_backend(nccl)"); - - const std::string group_name = comm.getSymmMemGroupKey(CommunicatorBackend::kNccl); - - static std::once_flag pg0_once; - std::call_once(pg0_once, [&]() { - try { - (void)c10d::resolve_process_group("0"); - } catch (const c10::Error&) { - auto pg = c10d::resolve_process_group(group_name); - c10d::register_process_group("0", pg); - } - }); + Communicator& comm = Communicator::getInstance(); + NVF_CHECK( + comm.is_available(), "Communicator not available for symmetric memory"); - comm.barrier(CommunicatorBackend::kNccl); - return group_name; - } + // Always return a valid group name + if (backend != SymmetricMemoryBackend::Native) { + NVF_CHECK( + comm.isBackendAvailable(CommunicatorBackend::kNccl), + "NCCL backend is required for non-native symmetric memory backend"); + + const std::string group_name = + comm.getSymmMemGroupKey(CommunicatorBackend::kNccl); + + static std::once_flag pg0_once; + std::call_once(pg0_once, [&]() { + try { + (void)c10d::resolve_process_group("0"); + } catch (const c10::Error&) { + auto pg = c10d::resolve_process_group(group_name); + c10d::register_process_group("0", pg); + } + }); - NVF_ERROR( - false, - "No c10d backend available for symmetric memory rendezvous. " - "Expected NCCL or UCC process group."); + comm.barrier(CommunicatorBackend::kNccl); + return group_name; } + + NVF_ERROR( + false, + "No c10d backend available for symmetric memory rendezvous. " + "Expected NCCL or UCC process group."); +} #endif @@ -160,7 +163,8 @@ at::Tensor SymmetricTensor::allocate( } // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name. c10::optional alloc_group_name = - (backend == SymmetricMemoryBackend::PyTorchNccl || backend == SymmetricMemoryBackend::PyTorchNvshmem) + (backend == SymmetricMemoryBackend::PyTorchNccl || + backend == SymmetricMemoryBackend::PyTorchNvshmem) ? c10::nullopt : c10::optional(group_name); return c10d::symmetric_memory::empty_strided_p2p( @@ -176,7 +180,8 @@ at::Tensor SymmetricTensor::allocate( NVF_ERROR( false, "PyTorch symmetric memory backend requires a build with " - "NVFUSER_DISTRIBUTED. Use NVFUSER_ENABLE=symmetric_memory_backend(native) " + "NVFUSER_DISTRIBUTED. Use " + "NVFUSER_ENABLE=symmetric_memory_backend(native) " "or do not set symmetric_memory_backend."); } #endif @@ -415,12 +420,13 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { return; } #ifdef NVFUSER_DISTRIBUTED - // PyTorch backend: perform rendezvous here (lazy, on first setupRemoteHandles). + // PyTorch backend: perform rendezvous here (lazy, on first + // setupRemoteHandles). SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); if (backend != SymmetricMemoryBackend::Native) { const std::string group_name = ensurePyTorchSymmMemBackend(backend); - torch_symm_handle_ = c10d::symmetric_memory::rendezvous( - local_tensor_, group_name); + torch_symm_handle_ = + c10d::symmetric_memory::rendezvous(local_tensor_, group_name); are_remote_tensors_setup_ = true; if (torch_symm_handle_->has_multicast_support()) { is_multicast_setup_ = true; @@ -509,7 +515,7 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { #ifdef NVFUSER_DISTRIBUTED if (torch_symm_handle_) { return torch_symm_handle_->get_remote_tensor( - rank, local_tensor_.sizes(), local_tensor_.scalar_type()); + rank, local_tensor_.sizes(), local_tensor_.scalar_type()); } #endif @@ -535,8 +541,8 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { #ifdef NVFUSER_DISTRIBUTED if (torch_symm_handle_) { NVF_THROW( - false, - "Contiguous view is not yet supported for PyTorch symmetric memory backend. " + "Contiguous view is not yet supported for PyTorch symmetric memory " + "backend." "Use native backend for SymmetricContiguousView."); } #endif @@ -606,8 +612,8 @@ at::Tensor SymmetricTensor::getContiguousView() const { #ifdef NVFUSER_DISTRIBUTED if (torch_symm_handle_) { NVF_THROW( - false, - "Contiguous view is not yet supported for PyTorch symmetric memory backend."); + "Contiguous view is not yet supported for PyTorch symmetric memory " + "backend."); } #endif NVF_CHECK(is_contiguous_view_setup_, "Contiguous view not setup"); @@ -625,7 +631,9 @@ void SymmetricTensor::setupMulticast( if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { if (!torch_symm_handle_) { setupRemoteHandles(tag); - NVF_CHECK(torch_symm_handle_->has_multicast_support(), "Multicast not supported"); + NVF_CHECK( + torch_symm_handle_->has_multicast_support(), + "Multicast not supported"); } return; } diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index 20f4d63ccda..02e47d8a7fc 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -28,7 +28,8 @@ namespace nvfuser { // - Native (default): Fuser's own CUDA VMM + IPC implementation // - PyTorch (Nccl, Nvshmem, Cuda): Use PyTorch's symmetric memory // (torch.distributed._symmetric_memory) with the chosen transport backend. -// Select via NVFUSER_ENABLE=symmetric_memory_backend(native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda). +// Select via +// NVFUSER_ENABLE=symmetric_memory_backend(native|pytorch_nccl|pytorch_nvshmem|pytorch_cuda). // Native remains the default when the option is not set. class SymmetricTensor { public: @@ -85,7 +86,8 @@ class SymmetricTensor { bool is_contiguous_view_setup_ = false; at::Tensor contiguous_view_; #ifdef NVFUSER_DISTRIBUTED - c10::intrusive_ptr torch_symm_handle_; + c10::intrusive_ptr + torch_symm_handle_; #endif }; diff --git a/tests/cpp/test_multidevice_symmetric_tensor.cpp b/tests/cpp/test_multidevice_symmetric_tensor.cpp index 020ea0b8745..e39a41861c2 100644 --- a/tests/cpp/test_multidevice_symmetric_tensor.cpp +++ b/tests/cpp/test_multidevice_symmetric_tensor.cpp @@ -5,9 +5,9 @@ * SPDX-License-Identifier: BSD-3-Clause */ // clang-format on +#include "multidevice/ipc_utils.h" #include "multidevice/symmetric_tensor.h" #include "tests/cpp/multidevice.h" -#include "multidevice/ipc_utils.h" namespace nvfuser { @@ -236,7 +236,6 @@ TEST_F(SymmetricTensorTest, SmallAllocation) { if (communicator_->size() == 1) { GTEST_SKIP() << "Skipping test for single device"; } - std::cout << "Vishal chishta" << std::endl; const int64_t rank = communicator_->deviceId(); const int64_t world_size = communicator_->size(); From 1be0134e8a6349bf227cb525b9c858ab90c8c1fa Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Tue, 24 Mar 2026 13:26:41 +0200 Subject: [PATCH 08/16] fix 3 lint errors --- csrc/multidevice/communicator.cpp | 2 +- csrc/multidevice/symmetric_tensor.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 30bed276b00..88cad27d012 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -467,4 +467,4 @@ std::string Communicator::getSymmMemGroupKey( return getTeamKey(all_ranks, b); } -} // namespace nvfuser \ No newline at end of file +} // namespace nvfuser diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index acefdca779e..2fc52284b92 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -64,7 +64,9 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { std::call_once(pg0_once, [&]() { try { (void)c10d::resolve_process_group("0"); - } catch (const c10::Error&) { + } catch (const std::exception&) { + // resolve_process_group throws c10::Error + // (derives from std::exception) auto pg = c10d::resolve_process_group(group_name); c10d::register_process_group("0", pg); } @@ -164,7 +166,7 @@ at::Tensor SymmetricTensor::allocate( // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name. c10::optional alloc_group_name = (backend == SymmetricMemoryBackend::PyTorchNccl || - backend == SymmetricMemoryBackend::PyTorchNvshmem) + backend == SymmetricMemoryBackend::PyTorchNvshmem) ? c10::nullopt : c10::optional(group_name); return c10d::symmetric_memory::empty_strided_p2p( From 3596301f42b3245f9faefdd49eed0997bddd319d Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Wed, 25 Mar 2026 10:12:59 +0200 Subject: [PATCH 09/16] Fix clang-tidy errors --- csrc/multidevice/communicator.cpp | 19 +++---- csrc/multidevice/communicator.h | 12 ----- csrc/multidevice/ipc_utils.cpp | 28 +++++----- csrc/multidevice/ipc_utils.h | 4 +- csrc/multidevice/symmetric_tensor.cpp | 77 +++++++++++++++------------ 5 files changed, 68 insertions(+), 72 deletions(-) diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 88cad27d012..b818bbb807c 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -14,13 +14,9 @@ #include #ifdef NVFUSER_DISTRIBUTED -#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP #include -#endif #include -#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP #include -#endif #include #ifdef USE_C10D_NCCL #include @@ -127,7 +123,8 @@ bool parseEnv( } // retrieves master port - if ((env = std::getenv("NVFUSER_MASTER_PORT")) != nullptr) { + env = std::getenv("NVFUSER_MASTER_PORT"); + if (env != nullptr) { master_port = std::atoi(env); } else { LOG(INFO) << "The environment variable NVFUSER_MASTER_PORT has not been " @@ -254,10 +251,10 @@ void waitForDebuggerAtRanks( std::cerr << "Process " << pid << " is waiting for the debugger. To continue debugging, " << "start gdb, `attach " << pid - << "`, `set var waiting=false`, and `fini`." << std::endl; + << "`, `set var waiting=false`, and `fini`.\n"; while (waiting) { // Please change `waiting` in the debugger. } - std::cerr << "Process " << getpid() << " finished waiting." << std::endl; + std::cerr << "Process " << getpid() << " finished waiting.\n"; } if (communicator->is_available()) { @@ -369,12 +366,10 @@ void Communicator::cleanup() { } } #endif -#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP for (const auto& entry : process_groups_) { c10d::unregister_process_group(entry.first); } process_groups_.clear(); -#endif backends_.clear(); } @@ -400,7 +395,7 @@ c10d::Backend* Communicator::getBackendForTeam( #ifdef NVFUSER_DISTRIBUTED backends_[team_key] = [&]() -> c10::intrusive_ptr { // check that the caller's rank belongs to the requested team - auto rank_it = std::find(team.begin(), team.end(), deviceId()); + auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); if (rank_it == team.end()) { return nullptr; } @@ -415,14 +410,13 @@ c10d::Backend* Communicator::getBackendForTeam( #else backends_[team_key] = nullptr; #endif -#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP std::optional pg_backend = (b == CommunicatorBackend::kNccl) ? std::optional( c10d::ProcessGroup::BackendType::NCCL) : std::nullopt; if (backends_[team_key] != nullptr && pg_backend.has_value()) { - auto rank_it = std::find(team.begin(), team.end(), deviceId()); + auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); RankType team_rank = std::distance(team.begin(), rank_it); auto pg = c10::make_intrusive( @@ -436,7 +430,6 @@ c10d::Backend* Communicator::getBackendForTeam( c10d::register_process_group(team_key, pg); process_groups_[team_key] = std::move(pg); } -#endif } return backends_.at(team_key).get(); } diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index e7efc8dbadf..0b3733505bf 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -11,19 +11,9 @@ #include #include -#if defined(NVFUSER_DISTRIBUTED) && \ - __has_include() && \ - __has_include() -#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 1 -#else -#define NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP 0 -#endif - #ifdef NVFUSER_DISTRIBUTED #include -#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP #include -#endif #include #include #else @@ -168,11 +158,9 @@ class NVF_API Communicator { c10::intrusive_ptr store_; // cache for the created backends. The keys are strings generated from Teams std::unordered_map> backends_; -#if NVFUSER_CAN_REGISTER_C10D_PROCESS_GROUP // c10d process-group wrappers registered for symmetric-memory rendezvous. std::unordered_map> process_groups_; -#endif }; } // namespace nvfuser diff --git a/csrc/multidevice/ipc_utils.cpp b/csrc/multidevice/ipc_utils.cpp index 01bdf949044..ab1ad413c87 100644 --- a/csrc/multidevice/ipc_utils.cpp +++ b/csrc/multidevice/ipc_utils.cpp @@ -38,7 +38,7 @@ int createIpcSocket(const std::string& path) { int sockfd = socket(AF_UNIX, SOCK_STREAM, 0); NVF_CHECK(sockfd >= 0, "Failed to create socket: ", strerror(errno)); - struct sockaddr_un addr; + struct sockaddr_un addr {}; setupSockAddr(addr, path); // For abstract namespace, len is usually calculated specifically, but for @@ -69,7 +69,7 @@ void sendFd( int sockfd = socket(AF_UNIX, SOCK_STREAM, 0); NVF_CHECK(sockfd >= 0, "Failed to create socket: ", strerror(errno)); - struct sockaddr_un addr; + struct sockaddr_un addr {}; setupSockAddr(addr, path); socklen_t addrlen = sizeof(addr.sun_family) + path.length(); @@ -77,8 +77,9 @@ void sendFd( int ret = -1; for (int i = 0; i < 100; ++i) { ret = connect(sockfd, (struct sockaddr*)&addr, addrlen); - if (ret == 0) + if (ret == 0) { break; + } usleep(10000); // 10ms } if (ret < 0) { @@ -86,14 +87,16 @@ void sendFd( NVF_CHECK(false, "Failed to connect to ", path, ": ", strerror(errno)); } - struct msghdr msg = {0}; - struct cmsghdr* cmsg; + struct msghdr msg {}; + struct cmsghdr* cmsg = nullptr; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays) char buf[CMSG_SPACE(sizeof(int))]; // If no header data, send at least one byte char dummy = '.'; - struct iovec iov; + struct iovec iov {}; if (header_data && header_len > 0) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) iov.iov_base = const_cast(header_data); iov.iov_len = header_len; } else { @@ -121,21 +124,22 @@ void sendFd( } int recvFd(int socket_fd, void* header_data, size_t header_len) { - struct sockaddr_un client_addr; + struct sockaddr_un client_addr {}; socklen_t client_len = sizeof(client_addr); int client_fd = accept(socket_fd, (struct sockaddr*)&client_addr, &client_len); NVF_CHECK(client_fd >= 0, "Failed to accept connection: ", strerror(errno)); - struct msghdr msg = {0}; - struct cmsghdr* cmsg; + struct msghdr msg {}; + struct cmsghdr* cmsg = nullptr; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays) char buf[CMSG_SPACE(sizeof(int))]; // If header_len > 0, we expect that much data. // Note: recvmsg might return fewer bytes if strict requirements aren't met, // but for local unix sockets with small payloads, it usually delivers all. - char dummy; - struct iovec iov; + char dummy = '.'; + struct iovec iov {}; if (header_data && header_len > 0) { iov.iov_base = header_data; iov.iov_len = header_len; @@ -168,7 +172,7 @@ int recvFd(int socket_fd, void* header_data, size_t header_len) { int recv_fd = -1; cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg != NULL && cmsg->cmsg_len == CMSG_LEN(sizeof(int))) { + if (cmsg != nullptr && cmsg->cmsg_len == CMSG_LEN(sizeof(int))) { if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { memcpy(&recv_fd, CMSG_DATA(cmsg), sizeof(int)); } diff --git a/csrc/multidevice/ipc_utils.h b/csrc/multidevice/ipc_utils.h index 9346ad69e6f..f0b69fcdbf1 100644 --- a/csrc/multidevice/ipc_utils.h +++ b/csrc/multidevice/ipc_utils.h @@ -29,7 +29,7 @@ const T& fromBytes(const std::vector& bytes) { // IPC Utils for sharing file descriptors -enum class MulticastProtocol { Memcpy, Multimem, BatchMemcpy }; +enum class MulticastProtocol : uint8_t { Memcpy, Multimem, BatchMemcpy }; MulticastProtocol getMulticastProtocol(); @@ -38,7 +38,7 @@ MulticastProtocol getMulticastProtocol(); // PyTorch*: Use PyTorch's symmetric memory // (torch.distributed._symmetric_memory) with the given transport backend (Nccl, // Nvshmem, or Cuda). -enum class SymmetricMemoryBackend { +enum class SymmetricMemoryBackend : uint8_t { Native, PyTorchNccl, PyTorchNvshmem, diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index 2fc52284b92..4af6f917673 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -65,7 +65,7 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { try { (void)c10d::resolve_process_group("0"); } catch (const std::exception&) { - // resolve_process_group throws c10::Error + // resolve_process_group throws c10d Error // (derives from std::exception) auto pg = c10d::resolve_process_group(group_name); c10d::register_process_group("0", pg); @@ -83,7 +83,6 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { } #endif - // Returns the allocation granularity for symmetric memory. // - query_mcast_granularity: if true, considers multicast granularity // - query_mcast_recommended_granularity: if true, uses recommended (larger) @@ -99,7 +98,7 @@ int64_t getGranularityForSymmetricMemory( #if (CUDA_VERSION >= NVF_MIN_CUDA_FOR_MCAST) if (!query_mcast_granularity) { - return alloc_granularity; + return static_cast(alloc_granularity); } // Check if device supports multicast before querying multicast granularity @@ -110,7 +109,7 @@ int64_t getGranularityForSymmetricMemory( prop.location.id)); if (is_multicast_supported == 0) { - return alloc_granularity; + return static_cast(alloc_granularity); } // Device supports multicast, query multicast granularity @@ -138,7 +137,7 @@ int64_t getGranularityForSymmetricMemory( granularity = mcast_rec_granularity; } - return std::max(alloc_granularity, granularity); + return static_cast(std::max(alloc_granularity, granularity)); #else (void)requested_size_bytes; (void)query_mcast_granularity; @@ -166,7 +165,7 @@ at::Tensor SymmetricTensor::allocate( // NCCLSymmetricMemoryAllocator::alloc must not be called with a group_name. c10::optional alloc_group_name = (backend == SymmetricMemoryBackend::PyTorchNccl || - backend == SymmetricMemoryBackend::PyTorchNvshmem) + backend == SymmetricMemoryBackend::PyTorchNvshmem) ? c10::nullopt : c10::optional(group_name); return c10d::symmetric_memory::empty_strided_p2p( @@ -188,25 +187,26 @@ at::Tensor SymmetricTensor::allocate( } #endif - int is_vmm_supported; + int is_vmm_supported = 0; NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute( &is_vmm_supported, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device.index())); NVF_ERROR(is_vmm_supported, "Device does not support VMM"); - const int64_t numel = std::accumulate( - sizes.begin(), sizes.end(), 1, std::multiplies()); - const int64_t element_size = c10::elementSize(dtype); + const int64_t numel = static_cast( + std::accumulate(sizes.begin(), sizes.end(), 1, std::multiplies<>())); + const int64_t element_size = static_cast(c10::elementSize(dtype)); const int64_t alloc_size = numel * element_size; CUmemAllocationProp prop{}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = static_cast(device.index()); + prop.location.id = + static_cast(static_cast(device.index())); prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - size_t granularity = + int64_t granularity = getGranularityForSymmetricMemory(prop, static_cast(alloc_size)); int64_t rounded_size = ((alloc_size + granularity - 1) / granularity) * granularity; @@ -221,20 +221,22 @@ at::Tensor SymmetricTensor::allocate( CUmemAccessDesc access{}; access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access.location.id = static_cast(device.index()); + access.location.id = + static_cast(static_cast(device.index())); access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; NVFUSER_CUDA_SAFE_CALL(cuMemSetAccess(ptr, rounded_size, &access, 1)); std::vector strides(sizes.size()); strides.back() = 1; - for (int64_t i = strides.size() - 2; i >= 0; --i) { + for (size_t i = strides.size() - 2; i >= 0; --i) { strides[i] = strides[i + 1] * sizes[i + 1]; } return at::from_blob( - (void*)ptr, + // NOLINTNEXTLINE(performance-no-int-to-ptr) + reinterpret_cast(ptr), sizes, - std::move(strides), + strides, [=](void* ptr) { cuMemUnmap((CUdeviceptr)(ptr), rounded_size); cuMemAddressFree((CUdeviceptr)(ptr), rounded_size); @@ -247,7 +249,7 @@ std::string SymmetricTensor::validate(at::Tensor tensor) { if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { return ""; } - int is_vmm_supported; + int is_vmm_supported = 0; NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute( &is_vmm_supported, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, @@ -260,7 +262,7 @@ std::string SymmetricTensor::validate(at::Tensor tensor) { CUmemLocation location{}; location.type = CU_MEM_LOCATION_TYPE_DEVICE; - location.id = Communicator::getInstance().local_rank(); + location.id = static_cast(Communicator::getInstance().local_rank()); unsigned long long flags = 0; NVFUSER_CUDA_SAFE_CALL(cuMemGetAccess(&flags, &location, ptr)); if (flags != CU_MEM_ACCESS_FLAGS_PROT_READWRITE) { @@ -269,7 +271,7 @@ std::string SymmetricTensor::validate(at::Tensor tensor) { CUmemGenericAllocationHandle alloc_handle = 0; NVFUSER_CUDA_SAFE_CALL( - cuMemRetainAllocationHandle(&alloc_handle, (void*)ptr)); + cuMemRetainAllocationHandle(&alloc_handle, tensor.data_ptr())); CUmemAllocationProp prop{}; NVFUSER_CUDA_SAFE_CALL( @@ -358,9 +360,9 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) alloc_handles_.resize(world_size_); remote_ptrs_.resize(world_size_); - CUmemGenericAllocationHandle local_handle; - NVFUSER_CUDA_SAFE_CALL(cuMemRetainAllocationHandle( - &local_handle, reinterpret_cast(local_ptr))); + CUmemGenericAllocationHandle local_handle = 0; + NVFUSER_CUDA_SAFE_CALL( + cuMemRetainAllocationHandle(&local_handle, local_tensor_.data_ptr())); alloc_handles_[my_device_id_] = local_handle; remote_ptrs_[my_device_id_] = local_ptr; @@ -386,8 +388,9 @@ SymmetricTensor::~SymmetricTensor() { // for now. cuMulticastUnbind(mcast_handle_, cu_dev_, 0, aligned_size_); cuMemRelease(mcast_handle_); } - if (peer_fd_ >= 0) + if (peer_fd_ >= 0) { close(peer_fd_); + } } #endif @@ -446,7 +449,7 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { NVFUSER_CUDA_SAFE_CALL(cuMemGetAddressRange(&base_ptr, &va_size, local_ptr)); size_t offset = local_ptr - base_ptr; - int shared_fd; + int shared_fd = -1; NVFUSER_CUDA_SAFE_CALL(cuMemExportToShareableHandle( &shared_fd, local_handle, @@ -475,9 +478,10 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { NVF_CHECK( sender_rank >= 0 && sender_rank < world_size_, "Invalid sender rank"); - CUmemGenericAllocationHandle peer_handle; + CUmemGenericAllocationHandle peer_handle = 0; NVFUSER_CUDA_SAFE_CALL(cuMemImportFromShareableHandle( &peer_handle, + // NOLINTNEXTLINE(performance-no-int-to-ptr) reinterpret_cast(static_cast(local_fd)), CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); @@ -517,12 +521,15 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { #ifdef NVFUSER_DISTRIBUTED if (torch_symm_handle_) { return torch_symm_handle_->get_remote_tensor( - rank, local_tensor_.sizes(), local_tensor_.scalar_type()); + static_cast(rank), + local_tensor_.sizes(), + local_tensor_.scalar_type()); } #endif NVF_CHECK(are_remote_tensors_setup_ == true, "Remote tensors not setup"); return at::from_blob( + // NOLINTNEXTLINE(performance-no-int-to-ptr) reinterpret_cast(remote_ptrs_[rank]), local_tensor_.sizes(), local_tensor_.strides(), @@ -557,7 +564,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { const int64_t world_size = comm.size(); size_t total_size = aligned_size_ * world_size; - CUdeviceptr base; + CUdeviceptr base = 0; NVFUSER_CUDA_SAFE_CALL( cuMemAddressReserve(&base, total_size, granularity_, 0, 0)); @@ -583,13 +590,15 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { NVF_CHECK( aligned_size_ % local_tensor_.element_size() == 0, "Aligned size must be divisible by element size"); - strides.push_back(aligned_size_ / local_tensor_.element_size()); + strides.push_back( + static_cast(aligned_size_ / local_tensor_.element_size())); for (int64_t s : local_tensor_.strides()) { strides.push_back(s); } size_t map_size = aligned_size_; contiguous_view_ = at::from_blob( + // NOLINTNEXTLINE(performance-no-int-to-ptr) reinterpret_cast(base), sizes, strides, @@ -644,14 +653,14 @@ void SymmetricTensor::setupMulticast( const int64_t my_rank = comm.deviceId(); const int64_t local_rank = comm.local_rank(); - int is_multicast_supported; + int is_multicast_supported = 0; NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute( &is_multicast_supported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, local_rank)); NVF_CHECK(is_multicast_supported, "Multicast not supported"); - exporter_rank_ = exporter_rank; + exporter_rank_ = static_cast(exporter_rank); CUmulticastObjectProp mcast_prop{}; mcast_prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; @@ -682,6 +691,7 @@ void SymmetricTensor::setupMulticast( NVFUSER_CUDA_SAFE_CALL(cuMemImportFromShareableHandle( &mcast_handle_, + // NOLINTNEXTLINE(performance-no-int-to-ptr) reinterpret_cast(static_cast(peer_fd_)), CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); } else { @@ -700,8 +710,8 @@ void SymmetricTensor::setupMulticast( NVFUSER_CUDA_SAFE_CALL(cuMulticastAddDevice(mcast_handle_, cu_dev_)); CUdeviceptr local_ptr = remote_ptrs_[my_device_id_]; - CUdeviceptr base_ptr; - size_t base_size; + CUdeviceptr base_ptr = 0; + size_t base_size = 0; NVFUSER_CUDA_SAFE_CALL( cuMemGetAddressRange(&base_ptr, &base_size, local_ptr)); size_t mem_offset = static_cast(local_ptr - base_ptr); @@ -721,7 +731,7 @@ void SymmetricTensor::setupMulticast( aligned_size_, 0)); - CUdeviceptr mc_ptr; + CUdeviceptr mc_ptr = 0; NVFUSER_CUDA_SAFE_CALL( cuMemAddressReserve(&mc_ptr, aligned_size_, granularity_, 0, 0)); NVFUSER_CUDA_SAFE_CALL(cuMemMap(mc_ptr, aligned_size_, 0, mcast_handle_, 0)); @@ -732,6 +742,7 @@ void SymmetricTensor::setupMulticast( access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; NVFUSER_CUDA_SAFE_CALL(cuMemSetAccess(mc_ptr, aligned_size_, &access, 1)); + // NOLINTNEXTLINE(performance-no-int-to-ptr) mc_ptr_ = reinterpret_cast(mc_ptr + offset_diff); mc_base_ptr_ = mc_ptr; is_multicast_setup_ = true; From 9b059152218c2b2f2e13431f08e9ff8ca3ef48f9 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Wed, 25 Mar 2026 10:38:01 +0200 Subject: [PATCH 10/16] Fixing outdated lint errors --- csrc/multidevice/communicator.cpp | 6 ++++-- csrc/multidevice/communicator.h | 2 ++ csrc/multidevice/ipc_utils.cpp | 14 +++++++------- csrc/multidevice/symmetric_tensor.cpp | 2 +- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index b818bbb807c..5b956f91245 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -352,12 +352,13 @@ void Communicator::cleanup() { store_ = nullptr; -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_C10D_NCCL) +#if defined(NVFUSER_DISTRIBUTED) +#if defined(USE_C10D_NCCL) // Sort backends to work around a NCCL bug (nvbugs/4889623). Closing backends // in different orders between ranks have been causing a hang. std::vector>> keyed_backends(backends_.begin(), backends_.end()); - std::sort(keyed_backends.begin(), keyed_backends.end()); + std::ranges::sort(keyed_backends.begin(), keyed_backends.end()); for (auto& [key, backend] : keyed_backends) { // Call shutdown before destructing a ProcessGroupNCCL as instructed by // https://github.com/pytorch/pytorch/blob/e62073d7997c9e63896cb5289ffd0874a8cc1838/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp#L1164-L1170. @@ -370,6 +371,7 @@ void Communicator::cleanup() { c10d::unregister_process_group(entry.first); } process_groups_.clear(); +#endif backends_.clear(); } diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 0b3733505bf..b2ff5c89150 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -159,8 +159,10 @@ class NVF_API Communicator { // cache for the created backends. The keys are strings generated from Teams std::unordered_map> backends_; // c10d process-group wrappers registered for symmetric-memory rendezvous. +#ifdef NVFUSER_DISTRIBUTED std::unordered_map> process_groups_; +#endif }; } // namespace nvfuser diff --git a/csrc/multidevice/ipc_utils.cpp b/csrc/multidevice/ipc_utils.cpp index ab1ad413c87..ae5461e4e70 100644 --- a/csrc/multidevice/ipc_utils.cpp +++ b/csrc/multidevice/ipc_utils.cpp @@ -38,7 +38,7 @@ int createIpcSocket(const std::string& path) { int sockfd = socket(AF_UNIX, SOCK_STREAM, 0); NVF_CHECK(sockfd >= 0, "Failed to create socket: ", strerror(errno)); - struct sockaddr_un addr {}; + struct sockaddr_un addr{}; setupSockAddr(addr, path); // For abstract namespace, len is usually calculated specifically, but for @@ -69,7 +69,7 @@ void sendFd( int sockfd = socket(AF_UNIX, SOCK_STREAM, 0); NVF_CHECK(sockfd >= 0, "Failed to create socket: ", strerror(errno)); - struct sockaddr_un addr {}; + struct sockaddr_un addr{}; setupSockAddr(addr, path); socklen_t addrlen = sizeof(addr.sun_family) + path.length(); @@ -87,14 +87,14 @@ void sendFd( NVF_CHECK(false, "Failed to connect to ", path, ": ", strerror(errno)); } - struct msghdr msg {}; + struct msghdr msg{}; struct cmsghdr* cmsg = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays) char buf[CMSG_SPACE(sizeof(int))]; // If no header data, send at least one byte char dummy = '.'; - struct iovec iov {}; + struct iovec iov{}; if (header_data && header_len > 0) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) iov.iov_base = const_cast(header_data); @@ -124,13 +124,13 @@ void sendFd( } int recvFd(int socket_fd, void* header_data, size_t header_len) { - struct sockaddr_un client_addr {}; + struct sockaddr_un client_addr{}; socklen_t client_len = sizeof(client_addr); int client_fd = accept(socket_fd, (struct sockaddr*)&client_addr, &client_len); NVF_CHECK(client_fd >= 0, "Failed to accept connection: ", strerror(errno)); - struct msghdr msg {}; + struct msghdr msg{}; struct cmsghdr* cmsg = nullptr; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays) char buf[CMSG_SPACE(sizeof(int))]; @@ -139,7 +139,7 @@ int recvFd(int socket_fd, void* header_data, size_t header_len) { // Note: recvmsg might return fewer bytes if strict requirements aren't met, // but for local unix sockets with small payloads, it usually delivers all. char dummy = '.'; - struct iovec iov {}; + struct iovec iov{}; if (header_data && header_len > 0) { iov.iov_base = header_data; iov.iov_len = header_len; diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index 4af6f917673..d7778122d03 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -228,7 +228,7 @@ at::Tensor SymmetricTensor::allocate( std::vector strides(sizes.size()); strides.back() = 1; - for (size_t i = strides.size() - 2; i >= 0; --i) { + for (int64_t i = static_cast(strides.size()) - 2; i >= 0; --i) { strides[i] = strides[i + 1] * sizes[i + 1]; } From 6147139a29396f129737d3bd5e0aa886df48690c Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Wed, 25 Mar 2026 18:43:09 +0200 Subject: [PATCH 11/16] Add torch distributed gaurd --- csrc/multidevice/communicator.cpp | 2 ++ csrc/multidevice/communicator.h | 4 ++-- csrc/multidevice/symmetric_tensor.cpp | 22 +++++++++++----------- csrc/multidevice/symmetric_tensor.h | 4 ++-- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index 5b956f91245..c2bd92c4c26 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -412,6 +412,7 @@ c10d::Backend* Communicator::getBackendForTeam( #else backends_[team_key] = nullptr; #endif +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) std::optional pg_backend = (b == CommunicatorBackend::kNccl) ? std::optional( @@ -432,6 +433,7 @@ c10d::Backend* Communicator::getBackendForTeam( c10d::register_process_group(team_key, pg); process_groups_[team_key] = std::move(pg); } +#endif } return backends_.at(team_key).get(); } diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index b2ff5c89150..85406c4d906 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -11,7 +11,7 @@ #include #include -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) #include #include #include @@ -159,7 +159,7 @@ class NVF_API Communicator { // cache for the created backends. The keys are strings generated from Teams std::unordered_map> backends_; // c10d process-group wrappers registered for symmetric-memory rendezvous. -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) std::unordered_map> process_groups_; #endif diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index d7778122d03..c40dc48e87b 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -15,7 +15,7 @@ #include "multidevice/ipc_utils.h" #include "multidevice/utils.h" -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) #include #include #endif @@ -24,7 +24,7 @@ namespace nvfuser { namespace { -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { static std::once_flag once; std::call_once(once, [backend]() { @@ -154,7 +154,7 @@ at::Tensor SymmetricTensor::allocate( at::Device device) { SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (backend != SymmetricMemoryBackend::Native) { const std::string group_name = ensurePyTorchSymmMemBackend(backend); std::vector strides(sizes.size()); @@ -181,7 +181,7 @@ at::Tensor SymmetricTensor::allocate( NVF_ERROR( false, "PyTorch symmetric memory backend requires a build with " - "NVFUSER_DISTRIBUTED. Use " + "NVFUSER_DISTRIBUTED and USE_DISTRIBUTED. Use " "NVFUSER_ENABLE=symmetric_memory_backend(native) " "or do not set symmetric_memory_backend."); } @@ -317,7 +317,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) "Expected CUDA tensor, got: ", local_tensor.device()); -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); if (backend != SymmetricMemoryBackend::Native) { Communicator& comm = Communicator::getInstance(); @@ -369,7 +369,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) } SymmetricTensor::~SymmetricTensor() { -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { return; // PyTorch backend: no native VMM cleanup } @@ -424,7 +424,7 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { if (are_remote_tensors_setup_ == true) { return; } -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) // PyTorch backend: perform rendezvous here (lazy, on first // setupRemoteHandles). SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); @@ -518,7 +518,7 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { return local_tensor_; } -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { return torch_symm_handle_->get_remote_tensor( static_cast(rank), @@ -547,7 +547,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { if (is_contiguous_view_setup_) { return; } -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { NVF_THROW( "Contiguous view is not yet supported for PyTorch symmetric memory " @@ -620,7 +620,7 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { } at::Tensor SymmetricTensor::getContiguousView() const { -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { NVF_THROW( "Contiguous view is not yet supported for PyTorch symmetric memory " @@ -638,7 +638,7 @@ void SymmetricTensor::setupMulticast( if (is_multicast_setup_) { return; } -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { if (!torch_symm_handle_) { setupRemoteHandles(tag); diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index 02e47d8a7fc..a2ab9e2d2ef 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -10,7 +10,7 @@ #include #include -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) #include #endif @@ -85,7 +85,7 @@ class SymmetricTensor { int peer_fd_{-1}; bool is_contiguous_view_setup_ = false; at::Tensor contiguous_view_; -#ifdef NVFUSER_DISTRIBUTED +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) c10::intrusive_ptr torch_symm_handle_; #endif From af128e412071af04ddbaec9b9bbb05f10d7f7d47 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Fri, 3 Apr 2026 07:48:54 +0300 Subject: [PATCH 12/16] Address pending review cmnts --- csrc/multidevice/communicator.cpp | 41 +++++++++++------------ csrc/multidevice/communicator.h | 8 +++-- csrc/multidevice/symmetric_tensor.cpp | 47 ++++++++++++++++----------- csrc/multidevice/symmetric_tensor.h | 2 +- 4 files changed, 55 insertions(+), 43 deletions(-) diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index c2bd92c4c26..b836d9b2394 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -334,6 +334,15 @@ Communicator& Communicator::getInstance() { return *communicator; } +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) +void Communicator::registerProcessGroup( + const std::string& name, + const c10::intrusive_ptr& pg) { + c10d::register_process_group(name, pg); + process_groups_[name] = pg; +} +#endif + void Communicator::cleanup() { static bool cleaned_up = false; NVF_CHECK( @@ -367,10 +376,12 @@ void Communicator::cleanup() { } } #endif +#if defined(USE_DISTRIBUTED) for (const auto& entry : process_groups_) { c10d::unregister_process_group(entry.first); } process_groups_.clear(); +#endif #endif backends_.clear(); } @@ -412,13 +423,10 @@ c10d::Backend* Communicator::getBackendForTeam( #else backends_[team_key] = nullptr; #endif + } #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) - std::optional pg_backend = - (b == CommunicatorBackend::kNccl) - ? std::optional( - c10d::ProcessGroup::BackendType::NCCL) - : std::nullopt; - if (backends_[team_key] != nullptr && pg_backend.has_value()) { + if (process_groups_.find(team_key) == process_groups_.end()) { + if (b == CommunicatorBackend::kNccl) { auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); RankType team_rank = std::distance(team.begin(), rank_it); @@ -426,15 +434,17 @@ c10d::Backend* Communicator::getBackendForTeam( c10::make_intrusive(team_key, store_), team_rank, static_cast(team.size())); - pg->setBackend(c10::DeviceType::CUDA, *pg_backend, backends_[team_key]); - pg->setDefaultBackend(*pg_backend); + pg->setBackend( + c10::DeviceType::CUDA, + c10d::ProcessGroup::BackendType::NCCL, + backends_[team_key]); + pg->setDefaultBackend(c10d::ProcessGroup::BackendType::NCCL); pg->setGroupName(team_key); - c10d::register_process_group(team_key, pg); - process_groups_[team_key] = std::move(pg); + registerProcessGroup(team_key, pg); } -#endif } +#endif return backends_.at(team_key).get(); } @@ -455,13 +465,4 @@ void Communicator::barrier(std::optional backend) { getWorld(backend)->barrier(options)->wait(); } -std::string Communicator::getSymmMemGroupKey( - std::optional backend) { - std::vector all_ranks(size_); - std::iota(all_ranks.begin(), all_ranks.end(), 0); - CommunicatorBackend b = backend.value_or(default_backend_); - (void)getBackendForTeam(all_ranks, b); - return getTeamKey(all_ranks, b); -} - } // namespace nvfuser diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 85406c4d906..35391f37580 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -111,9 +111,11 @@ class NVF_API Communicator { c10d::Backend* getWorld( std::optional backend = std::nullopt); - // Returns the world process-group name for the given backend. - std::string getSymmMemGroupKey( - std::optional backend = std::nullopt); +#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + void registerProcessGroup( + const std::string& name, + const c10::intrusive_ptr& pg); +#endif // returns if a backend is available for creation bool isBackendAvailable(CommunicatorBackend backend) const { diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index c40dc48e87b..ad9266b7952 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -57,22 +57,30 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { comm.isBackendAvailable(CommunicatorBackend::kNccl), "NCCL backend is required for non-native symmetric memory backend"); - const std::string group_name = - comm.getSymmMemGroupKey(CommunicatorBackend::kNccl); - - static std::once_flag pg0_once; - std::call_once(pg0_once, [&]() { - try { - (void)c10d::resolve_process_group("0"); - } catch (const std::exception&) { - // resolve_process_group throws c10d Error - // (derives from std::exception) - auto pg = c10d::resolve_process_group(group_name); - c10d::register_process_group("0", pg); - } - }); + std::vector all_ranks(comm.size()); + std::iota(all_ranks.begin(), all_ranks.end(), 0); + (void)comm.getBackendForTeam(all_ranks, CommunicatorBackend::kNccl); + std::string group_name = std::accumulate( + std::begin(all_ranks), + std::end(all_ranks), + std::string("nccl"), + [](const std::string& a, const RankType& b) { + return a.empty() ? std::to_string(b) : a + ',' + std::to_string(b); + }); + if (backend == SymmetricMemoryBackend::PyTorchNvshmem) { + static std::once_flag pg0_once; + std::call_once(pg0_once, [&]() { + try { + (void)c10d::resolve_process_group("0"); + } catch (const std::exception&) { + // resolve_process_group throws c10d Error + // (derives from std::exception) + auto pg = c10d::resolve_process_group(group_name); + comm.registerProcessGroup("0", pg); + } + }); + } - comm.barrier(CommunicatorBackend::kNccl); return group_name; } @@ -424,23 +432,24 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { if (are_remote_tensors_setup_ == true) { return; } + Communicator& comm = Communicator::getInstance(); #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) // PyTorch backend: perform rendezvous here (lazy, on first // setupRemoteHandles). SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); if (backend != SymmetricMemoryBackend::Native) { const std::string group_name = ensurePyTorchSymmMemBackend(backend); + comm.barrier(CommunicatorBackend::kNccl); torch_symm_handle_ = c10d::symmetric_memory::rendezvous(local_tensor_, group_name); are_remote_tensors_setup_ = true; if (torch_symm_handle_->has_multicast_support()) { is_multicast_setup_ = true; - mc_ptr_ = torch_symm_handle_->get_multicast_ptr(); + multicast_ptr_ = torch_symm_handle_->get_multicast_ptr(); } return; } #endif - Communicator& comm = Communicator::getInstance(); CUmemGenericAllocationHandle local_handle = alloc_handles_[my_device_id_]; CUdeviceptr local_ptr = remote_ptrs_[my_device_id_]; @@ -540,7 +549,7 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { void* SymmetricTensor::multicastPtr() const { NVF_CHECK(is_multicast_setup_, "Multicast not setup"); - return mc_ptr_; + return multicast_ptr_; } void SymmetricTensor::setupContiguousView(const std::string& tag) { @@ -743,7 +752,7 @@ void SymmetricTensor::setupMulticast( NVFUSER_CUDA_SAFE_CALL(cuMemSetAccess(mc_ptr, aligned_size_, &access, 1)); // NOLINTNEXTLINE(performance-no-int-to-ptr) - mc_ptr_ = reinterpret_cast(mc_ptr + offset_diff); + multicast_ptr_ = reinterpret_cast(mc_ptr + offset_diff); mc_base_ptr_ = mc_ptr; is_multicast_setup_ = true; diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index a2ab9e2d2ef..92af0d4f5f6 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -79,7 +79,7 @@ class SymmetricTensor { bool is_multicast_setup_ = false; CUmemGenericAllocationHandle mcast_handle_{}; CUdevice cu_dev_{}; - void* mc_ptr_{nullptr}; + void* multicast_ptr_{nullptr}; CUdeviceptr mc_base_ptr_{0}; int exporter_rank_{-1}; int peer_fd_{-1}; From 828573d3b4af61f4b84da292ee901f4c1cb14492 Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Fri, 10 Apr 2026 14:33:42 +0300 Subject: [PATCH 13/16] Add mocks for c10d --- csrc/multidevice/c10d_mock.h | 58 ++++++++++++- csrc/multidevice/communicator.cpp | 21 +++-- csrc/multidevice/communicator.h | 4 +- csrc/multidevice/symmetric_tensor.cpp | 120 ++++++++++++-------------- csrc/multidevice/symmetric_tensor.h | 4 +- 5 files changed, 130 insertions(+), 77 deletions(-) diff --git a/csrc/multidevice/c10d_mock.h b/csrc/multidevice/c10d_mock.h index 33a061e1481..7a392158ace 100644 --- a/csrc/multidevice/c10d_mock.h +++ b/csrc/multidevice/c10d_mock.h @@ -20,8 +20,12 @@ #pragma once +#include #include #include +#include +#include +#include #include namespace c10d { @@ -34,7 +38,7 @@ class Work : public torch::CustomClassHolder { }; struct ReduceOp : torch::CustomClassHolder { - enum RedOpType { + enum RedOpType : std::uint8_t { SUM, AVG, PRODUCT, @@ -211,4 +215,56 @@ class TCPStore : public torch::CustomClassHolder { } }; +class ProcessGroup : public torch::CustomClassHolder { + public: +}; + +inline c10::intrusive_ptr resolve_process_group( + const std::string&) { + return c10::make_intrusive(); +} + +inline void register_process_group( + const std::string&, + const c10::intrusive_ptr&) {} + } // namespace c10d + +namespace c10d::symmetric_memory { + +class SymmetricMemory : public torch::CustomClassHolder { + public: + ~SymmetricMemory() override = default; + virtual bool has_multicast_support() { + return false; + } + virtual void* get_multicast_ptr() { + return nullptr; + } + at::Tensor get_remote_tensor( + int peer, + c10::IntArrayRef sizes, + c10::ScalarType dtype) { + return at::empty(sizes, at::TensorOptions().dtype(dtype)); + } +}; + +inline void set_backend(const std::string&) {} + +inline at::Tensor empty_strided_p2p( + c10::IntArrayRef size, + c10::IntArrayRef stride, + c10::ScalarType dtype, + c10::Device device, + const std::optional& group_name, + std::optional) { + return at::empty(size, at::TensorOptions().dtype(dtype)); +} + +inline c10::intrusive_ptr rendezvous( + const at::Tensor&, + const std::optional& = std::nullopt) { + return c10::make_intrusive(); +} + +} // namespace c10d::symmetric_memory diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index b836d9b2394..ad953c1cffa 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -334,14 +334,14 @@ Communicator& Communicator::getInstance() { return *communicator; } -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) +// #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) void Communicator::registerProcessGroup( const std::string& name, const c10::intrusive_ptr& pg) { c10d::register_process_group(name, pg); process_groups_[name] = pg; } -#endif +// #endif void Communicator::cleanup() { static bool cleaned_up = false; @@ -402,16 +402,20 @@ c10d::Backend* Communicator::getBackendForTeam( // generate a string key which is unique to the team // create the team and cache it std::string team_key = prefix + getTeamKey(team, b); + auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); + if (rank_it == team.end()) { + return nullptr; + } // check if backend associated with the team is present in the cache if (backends_.find(team_key) == backends_.end()) { // create the backend and cache it #ifdef NVFUSER_DISTRIBUTED backends_[team_key] = [&]() -> c10::intrusive_ptr { // check that the caller's rank belongs to the requested team - auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); - if (rank_it == team.end()) { - return nullptr; - } + // auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); + // if (rank_it == team.end()) { + // return nullptr; + // } // retrieve the caller's rank index/position in the team RankType team_rank = std::distance(team.begin(), rank_it); return createBackend( @@ -427,7 +431,10 @@ c10d::Backend* Communicator::getBackendForTeam( #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (process_groups_.find(team_key) == process_groups_.end()) { if (b == CommunicatorBackend::kNccl) { - auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); + // auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); + // if (rank_it == team.end()) { + // return nullptr; + // } RankType team_rank = std::distance(team.begin(), rank_it); auto pg = c10::make_intrusive( diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 35391f37580..72017dde4a1 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -111,11 +111,11 @@ class NVF_API Communicator { c10d::Backend* getWorld( std::optional backend = std::nullopt); -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) void registerProcessGroup( const std::string& name, const c10::intrusive_ptr& pg); -#endif + // #endif // returns if a backend is available for creation bool isBackendAvailable(CommunicatorBackend backend) const { diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index ad9266b7952..68cf874e6ec 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -24,8 +24,8 @@ namespace nvfuser { namespace { -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) -std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { +// #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) +std::string initSymmMemBackendAndGetGroup(SymmetricMemoryBackend backend) { static std::once_flag once; std::call_once(once, [backend]() { const char* name = nullptr; @@ -51,45 +51,37 @@ std::string ensurePyTorchSymmMemBackend(SymmetricMemoryBackend backend) { NVF_CHECK( comm.is_available(), "Communicator not available for symmetric memory"); - // Always return a valid group name - if (backend != SymmetricMemoryBackend::Native) { - NVF_CHECK( - comm.isBackendAvailable(CommunicatorBackend::kNccl), - "NCCL backend is required for non-native symmetric memory backend"); - - std::vector all_ranks(comm.size()); - std::iota(all_ranks.begin(), all_ranks.end(), 0); - (void)comm.getBackendForTeam(all_ranks, CommunicatorBackend::kNccl); - std::string group_name = std::accumulate( - std::begin(all_ranks), - std::end(all_ranks), - std::string("nccl"), - [](const std::string& a, const RankType& b) { - return a.empty() ? std::to_string(b) : a + ',' + std::to_string(b); - }); - if (backend == SymmetricMemoryBackend::PyTorchNvshmem) { - static std::once_flag pg0_once; - std::call_once(pg0_once, [&]() { - try { - (void)c10d::resolve_process_group("0"); - } catch (const std::exception&) { - // resolve_process_group throws c10d Error - // (derives from std::exception) - auto pg = c10d::resolve_process_group(group_name); - comm.registerProcessGroup("0", pg); - } + NVF_CHECK( + comm.isBackendAvailable(CommunicatorBackend::kNccl), + "NCCL backend is required for non-native symmetric memory backend"); + + std::vector all_ranks(comm.size()); + std::iota(all_ranks.begin(), all_ranks.end(), 0); + (void)comm.getBackendForTeam(all_ranks, CommunicatorBackend::kNccl); + std::string group_name = std::accumulate( + std::begin(all_ranks), + std::end(all_ranks), + std::string("nccl"), + [](const std::string& a, const RankType& b) { + return a.empty() ? std::to_string(b) : a + ',' + std::to_string(b); }); - } - - return group_name; + if (backend == SymmetricMemoryBackend::PyTorchNvshmem) { + static std::once_flag pg0_once; + std::call_once(pg0_once, [&]() { + try { + (void)c10d::resolve_process_group("0"); + } catch (const std::exception&) { + // resolve_process_group throws c10d Error + // (derives from std::exception) + auto pg = c10d::resolve_process_group(group_name); + comm.registerProcessGroup("0", pg); + } + }); } - NVF_ERROR( - false, - "No c10d backend available for symmetric memory rendezvous. " - "Expected NCCL or UCC process group."); + return group_name; } -#endif +// #endif // Returns the allocation granularity for symmetric memory. // - query_mcast_granularity: if true, considers multicast granularity @@ -162,9 +154,9 @@ at::Tensor SymmetricTensor::allocate( at::Device device) { SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (backend != SymmetricMemoryBackend::Native) { - const std::string group_name = ensurePyTorchSymmMemBackend(backend); + const std::string group_name = initSymmMemBackendAndGetGroup(backend); std::vector strides(sizes.size()); strides.back() = 1; for (int64_t i = (int64_t)strides.size() - 2; i >= 0; --i) { @@ -184,16 +176,14 @@ at::Tensor SymmetricTensor::allocate( alloc_group_name, /*alloc_id=*/c10::nullopt); } -#else - if (backend != SymmetricMemoryBackend::Native) { - NVF_ERROR( - false, - "PyTorch symmetric memory backend requires a build with " - "NVFUSER_DISTRIBUTED and USE_DISTRIBUTED. Use " - "NVFUSER_ENABLE=symmetric_memory_backend(native) " - "or do not set symmetric_memory_backend."); - } -#endif + // #else + // NVF_ERROR( + // (backend == SymmetricMemoryBackend::Native), + // "PyTorch symmetric memory backend requires a build with " + // "NVFUSER_DISTRIBUTED and USE_DISTRIBUTED. Use " + // "NVFUSER_ENABLE=symmetric_memory_backend(native) " + // "or do not set symmetric_memory_backend."); + // #endif int is_vmm_supported = 0; NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute( @@ -236,7 +226,7 @@ at::Tensor SymmetricTensor::allocate( std::vector strides(sizes.size()); strides.back() = 1; - for (int64_t i = static_cast(strides.size()) - 2; i >= 0; --i) { + for (int64_t i = std::ssize(strides) - 2; i >= 0; --i) { strides[i] = strides[i + 1] * sizes[i + 1]; } @@ -325,7 +315,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) "Expected CUDA tensor, got: ", local_tensor.device()); -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); if (backend != SymmetricMemoryBackend::Native) { Communicator& comm = Communicator::getInstance(); @@ -334,7 +324,7 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) requested_size_ = local_tensor.numel() * local_tensor.element_size(); return; } -#endif + // #endif std::string error = SymmetricTensor::validate(local_tensor); NVF_CHECK(error.empty(), "Invalid symmetric allocation: ", error); @@ -377,11 +367,11 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) } SymmetricTensor::~SymmetricTensor() { -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { return; // PyTorch backend: no native VMM cleanup } -#endif + // #endif #if (CUDA_VERSION >= 13000) if (is_multicast_setup_) { if (mc_base_ptr_) { @@ -433,12 +423,12 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { return; } Communicator& comm = Communicator::getInstance(); -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) // PyTorch backend: perform rendezvous here (lazy, on first // setupRemoteHandles). SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); if (backend != SymmetricMemoryBackend::Native) { - const std::string group_name = ensurePyTorchSymmMemBackend(backend); + const std::string group_name = initSymmMemBackendAndGetGroup(backend); comm.barrier(CommunicatorBackend::kNccl); torch_symm_handle_ = c10d::symmetric_memory::rendezvous(local_tensor_, group_name); @@ -449,7 +439,7 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { } return; } -#endif + // #endif CUmemGenericAllocationHandle local_handle = alloc_handles_[my_device_id_]; CUdeviceptr local_ptr = remote_ptrs_[my_device_id_]; @@ -527,14 +517,14 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { return local_tensor_; } -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { return torch_symm_handle_->get_remote_tensor( static_cast(rank), local_tensor_.sizes(), local_tensor_.scalar_type()); } -#endif + // #endif NVF_CHECK(are_remote_tensors_setup_ == true, "Remote tensors not setup"); return at::from_blob( @@ -556,14 +546,14 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { if (is_contiguous_view_setup_) { return; } -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { NVF_THROW( "Contiguous view is not yet supported for PyTorch symmetric memory " "backend." "Use native backend for SymmetricContiguousView."); } -#endif + // #endif NVF_CHECK( are_remote_tensors_setup_ == true, "Remote tensors must be setup before setupContiguousView"); @@ -629,13 +619,13 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { } at::Tensor SymmetricTensor::getContiguousView() const { -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { NVF_THROW( "Contiguous view is not yet supported for PyTorch symmetric memory " "backend."); } -#endif + // #endif NVF_CHECK(is_contiguous_view_setup_, "Contiguous view not setup"); return contiguous_view_; } @@ -647,7 +637,7 @@ void SymmetricTensor::setupMulticast( if (is_multicast_setup_) { return; } -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { if (!torch_symm_handle_) { setupRemoteHandles(tag); @@ -657,7 +647,7 @@ void SymmetricTensor::setupMulticast( } return; } -#endif + // #endif Communicator& comm = Communicator::getInstance(); const int64_t my_rank = comm.deviceId(); const int64_t local_rank = comm.local_rank(); diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index 92af0d4f5f6..888472f2b46 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -85,10 +85,10 @@ class SymmetricTensor { int peer_fd_{-1}; bool is_contiguous_view_setup_ = false; at::Tensor contiguous_view_; -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) c10::intrusive_ptr torch_symm_handle_; -#endif + // #endif }; } // namespace nvfuser From 6a5d3c3aa9eef9afff785c34fa2914dd25b7982d Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Fri, 10 Apr 2026 15:18:15 +0300 Subject: [PATCH 14/16] Fix missing guard for process_groups --- csrc/multidevice/communicator.h | 4 ++-- csrc/multidevice/symmetric_tensor.cpp | 15 +++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 72017dde4a1..9fbbde89a4e 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -161,10 +161,10 @@ class NVF_API Communicator { // cache for the created backends. The keys are strings generated from Teams std::unordered_map> backends_; // c10d process-group wrappers registered for symmetric-memory rendezvous. -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) std::unordered_map> process_groups_; -#endif + // #endif }; } // namespace nvfuser diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index e1f2204fc5a..e262a7a89c9 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -745,21 +745,20 @@ void SymmetricTensor::setupMulticast( aligned_size_, 0)); - CUdeviceptr multicast_ptr_ = 0; + CUdeviceptr mc_ptr = 0; NVFUSER_CUDA_SAFE_CALL( - cuMemAddressReserve(&multicast_ptr_, aligned_size_, granularity_, 0, 0)); - NVFUSER_CUDA_SAFE_CALL( - cuMemMap(multicast_ptr_, aligned_size_, 0, mcast_handle_, 0)); + cuMemAddressReserve(&mc_ptr, aligned_size_, granularity_, 0, 0)); + NVFUSER_CUDA_SAFE_CALL(cuMemMap(mc_ptr, aligned_size_, 0, mcast_handle_, 0)); CUmemAccessDesc access{}; access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; access.location.id = static_cast(local_rank); access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - NVFUSER_CUDA_SAFE_CALL( - cuMemSetAccess(multicast_ptr_, aligned_size_, &access, 1)); + NVFUSER_CUDA_SAFE_CALL(cuMemSetAccess(mc_ptr, aligned_size_, &access, 1)); - multicast_ptr_ = multicast_ptr_ + offset_diff; - mc_base_ptr_ = multicast_ptr_; + // NOLINTNEXTLINE(performance-no-int-to-ptr) + multicast_ptr_ = reinterpret_cast(mc_ptr + offset_diff); + mc_base_ptr_ = mc_ptr; is_multicast_setup_ = true; comm.barrier(); From 2908e70f74ac191865923111841b68ca5083b31e Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Fri, 10 Apr 2026 16:05:08 +0300 Subject: [PATCH 15/16] include mock header for non distributed build --- csrc/multidevice/symmetric_tensor.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index 3ddc5955a0c..c323b045c9c 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -12,6 +12,8 @@ #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) #include +#else +#include "multidevice/c10d_mock.h" #endif namespace nvfuser { From aa4f9c1628c1a201e38dd7203b698fa56d27071c Mon Sep 17 00:00:00 2001 From: saivishal1999 Date: Fri, 10 Apr 2026 18:04:17 +0300 Subject: [PATCH 16/16] Remove guard comments --- csrc/multidevice/c10d_mock.h | 14 +++++++----- csrc/multidevice/communicator.cpp | 15 +------------ csrc/multidevice/communicator.h | 7 ++---- csrc/multidevice/symmetric_tensor.cpp | 32 +-------------------------- csrc/multidevice/symmetric_tensor.h | 3 +-- 5 files changed, 13 insertions(+), 58 deletions(-) diff --git a/csrc/multidevice/c10d_mock.h b/csrc/multidevice/c10d_mock.h index 7a392158ace..bc51fe02dfb 100644 --- a/csrc/multidevice/c10d_mock.h +++ b/csrc/multidevice/c10d_mock.h @@ -220,13 +220,15 @@ class ProcessGroup : public torch::CustomClassHolder { }; inline c10::intrusive_ptr resolve_process_group( - const std::string&) { + const std::string& group_name) { return c10::make_intrusive(); } inline void register_process_group( - const std::string&, - const c10::intrusive_ptr&) {} + const std::string& group_name, + const c10::intrusive_ptr& group) {} + +inline void unregister_process_group(const std::string& group_name) {} } // namespace c10d @@ -257,13 +259,13 @@ inline at::Tensor empty_strided_p2p( c10::ScalarType dtype, c10::Device device, const std::optional& group_name, - std::optional) { + std::optional alloc_id) { return at::empty(size, at::TensorOptions().dtype(dtype)); } inline c10::intrusive_ptr rendezvous( - const at::Tensor&, - const std::optional& = std::nullopt) { + const at::Tensor& tensor, + const std::optional& group_name = std::nullopt) { return c10::make_intrusive(); } diff --git a/csrc/multidevice/communicator.cpp b/csrc/multidevice/communicator.cpp index ad953c1cffa..afe2ff1308d 100644 --- a/csrc/multidevice/communicator.cpp +++ b/csrc/multidevice/communicator.cpp @@ -16,7 +16,6 @@ #ifdef NVFUSER_DISTRIBUTED #include #include -#include #include #ifdef USE_C10D_NCCL #include @@ -334,14 +333,12 @@ Communicator& Communicator::getInstance() { return *communicator; } -// #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) void Communicator::registerProcessGroup( const std::string& name, const c10::intrusive_ptr& pg) { c10d::register_process_group(name, pg); process_groups_[name] = pg; } -// #endif void Communicator::cleanup() { static bool cleaned_up = false; @@ -376,12 +373,10 @@ void Communicator::cleanup() { } } #endif -#if defined(USE_DISTRIBUTED) for (const auto& entry : process_groups_) { c10d::unregister_process_group(entry.first); } process_groups_.clear(); -#endif #endif backends_.clear(); } @@ -402,6 +397,7 @@ c10d::Backend* Communicator::getBackendForTeam( // generate a string key which is unique to the team // create the team and cache it std::string team_key = prefix + getTeamKey(team, b); + // check that the caller's rank belongs to the requested team auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); if (rank_it == team.end()) { return nullptr; @@ -411,11 +407,6 @@ c10d::Backend* Communicator::getBackendForTeam( backends_.end()) { // create the backend and cache it #ifdef NVFUSER_DISTRIBUTED backends_[team_key] = [&]() -> c10::intrusive_ptr { - // check that the caller's rank belongs to the requested team - // auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); - // if (rank_it == team.end()) { - // return nullptr; - // } // retrieve the caller's rank index/position in the team RankType team_rank = std::distance(team.begin(), rank_it); return createBackend( @@ -431,10 +422,6 @@ c10d::Backend* Communicator::getBackendForTeam( #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (process_groups_.find(team_key) == process_groups_.end()) { if (b == CommunicatorBackend::kNccl) { - // auto rank_it = std::ranges::find(team.begin(), team.end(), deviceId()); - // if (rank_it == team.end()) { - // return nullptr; - // } RankType team_rank = std::distance(team.begin(), rank_it); auto pg = c10::make_intrusive( diff --git a/csrc/multidevice/communicator.h b/csrc/multidevice/communicator.h index 9fbbde89a4e..f19707334b2 100644 --- a/csrc/multidevice/communicator.h +++ b/csrc/multidevice/communicator.h @@ -11,7 +11,7 @@ #include #include -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) +#if defined(NVFUSER_DISTRIBUTED) #include #include #include @@ -111,11 +111,9 @@ class NVF_API Communicator { c10d::Backend* getWorld( std::optional backend = std::nullopt); - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) void registerProcessGroup( const std::string& name, const c10::intrusive_ptr& pg); - // #endif // returns if a backend is available for creation bool isBackendAvailable(CommunicatorBackend backend) const { @@ -161,10 +159,9 @@ class NVF_API Communicator { // cache for the created backends. The keys are strings generated from Teams std::unordered_map> backends_; // c10d process-group wrappers registered for symmetric-memory rendezvous. - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) + // Keeps track of the process groups created for the rendezvous. std::unordered_map> process_groups_; - // #endif }; } // namespace nvfuser diff --git a/csrc/multidevice/symmetric_tensor.cpp b/csrc/multidevice/symmetric_tensor.cpp index e262a7a89c9..862a40045e3 100644 --- a/csrc/multidevice/symmetric_tensor.cpp +++ b/csrc/multidevice/symmetric_tensor.cpp @@ -15,16 +15,10 @@ #include "multidevice/ipc_utils.h" #include "multidevice/utils.h" -#if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) -#include -#include -#endif - namespace nvfuser { namespace { -// #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) std::string initSymmMemBackendAndGetGroup(SymmetricMemoryBackend backend) { static std::once_flag once; std::call_once(once, [backend]() { @@ -53,7 +47,7 @@ std::string initSymmMemBackendAndGetGroup(SymmetricMemoryBackend backend) { NVF_CHECK( comm.isBackendAvailable(CommunicatorBackend::kNccl), - "NCCL backend is required for non-native symmetric memory backend"); + "kNccl backend is required for non-native symmetric memory backend"); std::vector all_ranks(comm.size()); std::iota(all_ranks.begin(), all_ranks.end(), 0); @@ -81,7 +75,6 @@ std::string initSymmMemBackendAndGetGroup(SymmetricMemoryBackend backend) { return group_name; } -// #endif // Returns the allocation granularity for symmetric memory. // - query_mcast_granularity: if true, considers multicast granularity @@ -154,7 +147,6 @@ at::Tensor SymmetricTensor::allocate( at::Device device) { SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (backend != SymmetricMemoryBackend::Native) { const std::string group_name = initSymmMemBackendAndGetGroup(backend); std::vector strides(sizes.size()); @@ -176,14 +168,6 @@ at::Tensor SymmetricTensor::allocate( alloc_group_name, /*alloc_id=*/c10::nullopt); } - // #else - // NVF_ERROR( - // (backend == SymmetricMemoryBackend::Native), - // "PyTorch symmetric memory backend requires a build with " - // "NVFUSER_DISTRIBUTED and USE_DISTRIBUTED. Use " - // "NVFUSER_ENABLE=symmetric_memory_backend(native) " - // "or do not set symmetric_memory_backend."); - // #endif int is_vmm_supported = 0; NVFUSER_CUDA_SAFE_CALL(cuDeviceGetAttribute( @@ -314,7 +298,6 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) "Expected CUDA tensor, got: ", local_tensor.device()); - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); if (backend != SymmetricMemoryBackend::Native) { Communicator& comm = Communicator::getInstance(); @@ -323,7 +306,6 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) requested_size_ = local_tensor.numel() * local_tensor.element_size(); return; } - // #endif std::string error = SymmetricTensor::validate(local_tensor); NVF_CHECK(error.empty(), "Invalid symmetric allocation: ", error); @@ -367,11 +349,9 @@ SymmetricTensor::SymmetricTensor(const at::Tensor& local_tensor) } SymmetricTensor::~SymmetricTensor() { - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { return; // PyTorch backend: no native VMM cleanup } - // #endif #if (CUDA_VERSION >= 13000) if (is_multicast_setup_) { if (mc_base_ptr_) { @@ -423,7 +403,6 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { return; } Communicator& comm = Communicator::getInstance(); - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) // PyTorch backend: perform rendezvous here (lazy, on first // setupRemoteHandles). SymmetricMemoryBackend backend = getSymmetricMemoryBackend(); @@ -439,7 +418,6 @@ void SymmetricTensor::setupRemoteHandles(const std::string& tag) { } return; } - // #endif CUmemGenericAllocationHandle local_handle = alloc_handles_[my_device_id_]; CUdeviceptr local_ptr = remote_ptrs_[my_device_id_]; @@ -533,14 +511,12 @@ at::Tensor SymmetricTensor::remoteTensor(int64_t rank) const { return local_tensor_; } - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { return torch_symm_handle_->get_remote_tensor( static_cast(rank), local_tensor_.sizes(), local_tensor_.scalar_type()); } - // #endif NVF_CHECK(are_remote_tensors_setup_ == true, "Remote tensors not setup"); return at::from_blob( @@ -562,14 +538,12 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { if (is_contiguous_view_setup_) { return; } - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { NVF_THROW( "Contiguous view is not yet supported for PyTorch symmetric memory " "backend." "Use native backend for SymmetricContiguousView."); } - // #endif NVF_CHECK( are_remote_tensors_setup_ == true, "Remote tensors must be setup before setupContiguousView"); @@ -634,13 +608,11 @@ void SymmetricTensor::setupContiguousView(const std::string& tag) { } at::Tensor SymmetricTensor::getContiguousView() const { - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (torch_symm_handle_) { NVF_THROW( "Contiguous view is not yet supported for PyTorch symmetric memory " "backend."); } - // #endif NVF_CHECK(is_contiguous_view_setup_, "Contiguous view not setup"); return contiguous_view_; } @@ -652,7 +624,6 @@ void SymmetricTensor::setupMulticast( if (is_multicast_setup_) { return; } - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::Native) { if (!torch_symm_handle_) { setupRemoteHandles(tag); @@ -662,7 +633,6 @@ void SymmetricTensor::setupMulticast( } return; } - // #endif Communicator& comm = Communicator::getInstance(); const int64_t my_rank = comm.deviceId(); const int64_t local_rank = comm.local_rank(); diff --git a/csrc/multidevice/symmetric_tensor.h b/csrc/multidevice/symmetric_tensor.h index c323b045c9c..ae10cb3dcbc 100644 --- a/csrc/multidevice/symmetric_tensor.h +++ b/csrc/multidevice/symmetric_tensor.h @@ -11,6 +11,7 @@ #include #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) +#include #include #else #include "multidevice/c10d_mock.h" @@ -94,10 +95,8 @@ class SymmetricTensor { int peer_fd_{-1}; bool is_contiguous_view_setup_ = false; at::Tensor contiguous_view_; - // #if defined(NVFUSER_DISTRIBUTED) && defined(USE_DISTRIBUTED) c10::intrusive_ptr torch_symm_handle_; - // #endif }; } // namespace nvfuser