Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
febdbf9
WIP; need amd fix
chhwang Feb 21, 2026
54e46ba
rocm fix wip
chhwang Feb 23, 2026
98b023a
rocm fixes
chhwang Feb 24, 2026
22e5efb
gdrcopy install in container
chhwang Feb 24, 2026
25f31b4
updates
chhwang Feb 24, 2026
75dfdd9
Merge branch 'main' into chhwang/fix-ib-no-atomic
chhwang Feb 24, 2026
ac4d713
updates
chhwang Feb 24, 2026
ac022c3
a few updates
chhwang Feb 25, 2026
72407af
License
chhwang Feb 25, 2026
8effd97
License
chhwang Feb 25, 2026
fd7358d
License, lint
chhwang Feb 25, 2026
67d1706
optimized recv loop
chhwang Feb 26, 2026
060982d
updates
chhwang Feb 26, 2026
6b2f819
Merge branch 'main' into chhwang/fix-ib-no-atomic
chhwang Feb 26, 2026
3b56b08
data direct
chhwang Mar 4, 2026
448ceb6
updates
chhwang Mar 5, 2026
7ce841b
Updates
chhwang Mar 5, 2026
6bbb042
debug
Binyang2014 Mar 6, 2026
4892b4e
fix
Binyang2014 Mar 8, 2026
5d9f761
debug
Binyang2014 Mar 9, 2026
3efb1fd
update
Binyang2014 Mar 9, 2026
2478553
Unique QP per channel and env-controlled GID index
Binyang2014 Mar 9, 2026
ce9bada
update
Binyang2014 Mar 9, 2026
57af391
update
Binyang2014 Mar 9, 2026
1cc8422
update
Binyang2014 Mar 10, 2026
42d9845
update
Binyang2014 Mar 12, 2026
c84c2ed
update the number of instances
mahdiehghazim Mar 17, 2026
8dc63fa
re-format output
mahdiehghazim Mar 17, 2026
0f38ab5
add scripts
mahdiehghazim Mar 17, 2026
51416d6
update scripts
mahdiehghazim Mar 17, 2026
c919b96
show scale in output
mahdiehghazim Mar 17, 2026
9dd47b3
update the executor so we have message size range
mahdiehghazim Mar 17, 2026
fc3f4b9
tune #instances and remove extra barriers
mahdiehghazim Mar 19, 2026
8f6df17
add sendrecv correctness check
mahdiehghazim Apr 3, 2026
a83f0db
add debugging code
mahdiehghazim Apr 3, 2026
455172b
merge main
Binyang2014 Apr 9, 2026
92a5224
Add MSCCLPP_IB_GID_INDEX env variable for IB transport
Binyang2014 Apr 9, 2026
a5fd7e4
Add MSCCLPP_IB_GID_INDEX env and wire up to endpoint QP creation
Binyang2014 Apr 9, 2026
b61e0a7
WIP
Binyang2014 Apr 9, 2026
b2d47ff
Expose ib_gid_index in Python env binding
Binyang2014 Apr 9, 2026
2900726
Merge branch 'main' into binyli/ib-no-atomic-test
Binyang2014 Apr 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/mscclpp/core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ struct EndpointConfig {
};

static constexpr int DefaultPort = -1;
static constexpr int DefaultGidIndex = 0;
static constexpr int DefaultGidIndex = -1;
static constexpr int DefaultMaxCqSize = 1024;
Comment thread
Binyang2014 marked this conversation as resolved.
static constexpr int DefaultMaxCqPollNum = 1;
static constexpr int DefaultMaxSendWr = 8192;
Expand Down Expand Up @@ -418,7 +418,7 @@ struct EndpointConfig {
/// Constructor.
/// @param deviceIndex Device index.
/// @param port Port number.
/// @param gidIndex GID index.
/// @param gidIndex GID index. If negative (the default is -1), the value of the `MSCCLPP_IB_GID_INDEX` env variable is used instead.
/// @param maxCqSize Maximum send completion queue size.
/// @param maxCqPollNum Maximum send completion queue poll count.
/// @param maxSendWr Maximum outstanding send work requests.
Expand Down
4 changes: 4 additions & 0 deletions include/mscclpp/env.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ class Env {
/// Default is false.
const bool forceDisableGdr;

/// Env name: `MSCCLPP_IB_GID_INDEX`. The GID index to use for IB transport.
/// Default is 0. Used only when `EndpointConfig::Ib::gidIndex` is negative (unspecified; its default is -1).
const int ibGidIndex;

private:
Env();

Expand Down
3 changes: 2 additions & 1 deletion python/csrc/env_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ void register_env(nb::module_& m) {
.def_ro("ibv_mode", &Env::ibvMode)
.def_ro("cache_dir", &Env::cacheDir)
.def_ro("npkit_dump_dir", &Env::npkitDumpDir)
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream);
.def_ro("cuda_ipc_use_default_stream", &Env::cudaIpcUseDefaultStream)
.def_ro("ib_gid_index", &Env::ibGidIndex);

m.def("env", &env);
}
5 changes: 5 additions & 0 deletions src/core/endpoint.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ Endpoint::Impl::Impl(const EndpointConfig& config, Context::Impl& contextImpl)
}
}

// Resolve GID index: explicit value (>= 0) takes priority, otherwise use env
if (config_.ib.gidIndex < 0) {
config_.ib.gidIndex = env()->ibGidIndex;
}
Comment thread
Binyang2014 marked this conversation as resolved.

int maxRecvWr = ibNoAtomic_ ? config_.ib.maxRecvWr : 0;

ibQp_ = contextImpl.getIbContext(config_.transport)
Expand Down
4 changes: 3 additions & 1 deletion src/core/env.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ Env::Env()
forceNcclFallbackOperation(readEnv<std::string>("MSCCLPP_FORCE_NCCL_FALLBACK_OPERATION", "")),
ncclSymmetricMemory(readEnv<bool>("MSCCLPP_NCCL_SYMMETRIC_MEMORY", false)),
forceDisableNvls(readEnv<bool>("MSCCLPP_FORCE_DISABLE_NVLS", false)),
forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)) {}
forceDisableGdr(readEnv<bool>("MSCCLPP_FORCE_DISABLE_GDR", false)),
ibGidIndex(readEnv<int>("MSCCLPP_IB_GID_INDEX", 0)) {}

std::shared_ptr<Env> env() {
static std::shared_ptr<Env> globalEnv = std::shared_ptr<Env>(new Env());
Expand Down Expand Up @@ -95,6 +96,7 @@ std::shared_ptr<Env> env() {
logEnv("MSCCLPP_NCCL_SYMMETRIC_MEMORY", globalEnv->ncclSymmetricMemory);
logEnv("MSCCLPP_FORCE_DISABLE_NVLS", globalEnv->forceDisableNvls);
logEnv("MSCCLPP_FORCE_DISABLE_GDR", globalEnv->forceDisableGdr);
logEnv("MSCCLPP_IB_GID_INDEX", globalEnv->ibGidIndex);
}
return globalEnv;
}
Expand Down
Loading