-
Notifications
You must be signed in to change notification settings - Fork 79
multidevice: enable NCCL Copy Engine allgather via CTAPolicy=ZERO #6046
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -5,7 +5,10 @@ | |||||||||||
| * SPDX-License-Identifier: BSD-3-Clause | ||||||||||||
| */ | ||||||||||||
| // clang-format on | ||||||||||||
| #include <numeric> | ||||||||||||
|
|
||||||||||||
| #include "multidevice/cuda_p2p.h" | ||||||||||||
| #include "multidevice/communicator.h" | ||||||||||||
| #include "multidevice/ipc_utils.h" | ||||||||||||
| #include "multidevice/symmetric_tensor.h" | ||||||||||||
| #include "tests/cpp/multidevice.h" | ||||||||||||
|
|
@@ -385,4 +388,63 @@ TEST_F(SymmetricTensorTest, SmallAllocationMulticast) { | |||||||||||
| #endif | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| // Verifies that allgather over symm_mem-allocated tensors produces correct | ||||||||||||
| // results when the NCCL PG is configured with CTAPolicy=ZERO (CE path). | ||||||||||||
| // Run with NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl). | ||||||||||||
| TEST_F(SymmetricTensorTest, CopyEngineAllgather) { | ||||||||||||
| if (getSymmetricMemoryBackend() != SymmetricMemoryBackend::PyTorchNccl) { | ||||||||||||
| GTEST_SKIP() | ||||||||||||
| << "Test requires NVFUSER_ENABLE=symmetric_memory_backend(pytorch_nccl)"; | ||||||||||||
| } | ||||||||||||
| if (communicator_->size() == 1) { | ||||||||||||
| GTEST_SKIP() << "Skipping single-device run"; | ||||||||||||
| } | ||||||||||||
| if (!communicator_->isBackendAvailable(CommunicatorBackend::kNccl)) { | ||||||||||||
| GTEST_SKIP() << "NCCL backend not available"; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| const int64_t rank = communicator_->deviceId(); | ||||||||||||
| const int64_t world_size = communicator_->size(); | ||||||||||||
| // 4MB per rank — large enough that CE scheduling overhead is worthwhile. | ||||||||||||
| constexpr int64_t kNumElems = 1024 * 1024; | ||||||||||||
|
|
||||||||||||
| // Allocate via empty_strided_p2p so NCCL can window-register the buffers, | ||||||||||||
| // which is required for the Copy Engine collective path. | ||||||||||||
| at::Tensor input = SymmetricTensor::allocate( | ||||||||||||
| {kNumElems}, at::ScalarType::Float, communicator_->device()); | ||||||||||||
| at::Tensor output = SymmetricTensor::allocate( | ||||||||||||
| {world_size * kNumElems}, at::ScalarType::Float, communicator_->device()); | ||||||||||||
|
|
||||||||||||
| // setupRemoteHandles triggers c10d::symmetric_memory::rendezvous, which | ||||||||||||
| // performs the NCCL window registration on both buffers. | ||||||||||||
| SymmetricTensor input_sym(input); | ||||||||||||
| SymmetricTensor output_sym(output); | ||||||||||||
| input_sym.setupRemoteHandles(); | ||||||||||||
| output_sym.setupRemoteHandles(); | ||||||||||||
|
|
||||||||||||
| // Each rank fills its input with a unique value (rank+1). | ||||||||||||
| input.fill_(static_cast<float>(rank + 1)); | ||||||||||||
|
|
||||||||||||
| // getBackendForTeam returns the NCCL PG created with CTAPolicy=ZERO by our | ||||||||||||
| // change, so _allgather_base will use the Copy Engine when both conditions | ||||||||||||
| // (CTAPolicy=ZERO + window-registered buffers) are met. | ||||||||||||
| Team all_ranks(world_size); | ||||||||||||
| std::iota(all_ranks.begin(), all_ranks.end(), 0); | ||||||||||||
| c10d::Backend* backend = | ||||||||||||
| communicator_->getBackendForTeam(all_ranks, CommunicatorBackend::kNccl); | ||||||||||||
|
Comment on lines
+431
to
+434
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time! |
||||||||||||
| ASSERT_NE(backend, nullptr); | ||||||||||||
|
|
||||||||||||
| auto work = backend->_allgather_base(output, input, {}); | ||||||||||||
| work->wait(); | ||||||||||||
|
|
||||||||||||
| // Validate: gathered slice for rank r must equal r+1 on every rank. | ||||||||||||
| at::Tensor output_cpu = output.cpu(); | ||||||||||||
| for (int64_t r = 0; r < world_size; ++r) { | ||||||||||||
| at::Tensor slice = | ||||||||||||
| output_cpu.slice(0, r * kNumElems, (r + 1) * kNumElems); | ||||||||||||
| EXPECT_TRUE(slice.eq(static_cast<float>(r + 1)).all().item<bool>()) | ||||||||||||
| << "Rank " << rank << ": allgather mismatch for source rank " << r; | ||||||||||||
| } | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| } // namespace nvfuser | ||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`createBackend` is called lazily by `getBackendForTeam` whenever any new team key is created — including sub-team PGs (e.g., tensor-parallel or pipeline-parallel groups). When `PyTorchNccl` is active, every future NCCL process group will be created with `CTAPolicy=ZERO`, not only the world communicator used for the symmetric-memory allgather. Operations on those sub-team groups (allreduce, reduce-scatter, broadcast, etc.) that cannot use the CE path will be executed under the ZERO-CTA policy, which may meaningfully change their latency/throughput relative to the default policy.