Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
de8114e
update miltinode-pool
Binyang2014 Apr 8, 2026
77a30a8
update
Binyang2014 Apr 9, 2026
152eb4d
Remove hardcoded VMSS hostnames from deploy files
Binyang2014 Apr 9, 2026
7a0cf0d
Fix multi-node H100 CI: drop cuda11.8, add gpuArch, improve arch dete…
Binyang2014 Apr 9, 2026
545d367
Use cuda13.0 image for multi-node H100 CI
Binyang2014 Apr 9, 2026
6ca257d
testing
Binyang2014 Apr 9, 2026
a2ef206
fix CI
Binyang2014 Apr 9, 2026
d531e4d
Speed up deploy by archiving before scp
Binyang2014 Apr 10, 2026
98af765
Merge branch 'main' into binyli/multinode-ci
Binyang2014 Apr 10, 2026
db469b6
debug
Binyang2014 Apr 10, 2026
5926535
Fix tar extraction path in deploy.sh
Binyang2014 Apr 10, 2026
7f14ca2
update
Binyang2014 Apr 10, 2026
f8f8aff
Fix test binary paths: build/test/ -> build/bin/
Binyang2014 Apr 10, 2026
8c096b4
Add eth0 MPI TCP interface and deduplicate mpirun args
Binyang2014 Apr 10, 2026
22a20a4
Set MSCCLPP_SOCKET_IFNAME=eth0 for multi-node bootstrap
Binyang2014 Apr 10, 2026
724f889
Fix cross-node CudaIpc crash when Fabric/IMEX unavailable
Binyang2014 Apr 10, 2026
453e0ed
Generate SSH config dynamically from hostfile_mpi
Binyang2014 Apr 10, 2026
f138b13
Select perf baseline based on GPU type (H100 -> ndmv5)
Binyang2014 Apr 10, 2026
0ddd37f
Add H100 multi-node perf baselines to ndmv5
Binyang2014 Apr 10, 2026
5ad154a
Use eth0 IP for mp_unit_tests bootstrap endpoint
Binyang2014 Apr 10, 2026
50da168
Revert peer-access-test Makefile to use -arch=native
Binyang2014 Apr 10, 2026
feee30f
Allow RunMscclppTest to fail without blocking pipeline
Binyang2014 Apr 10, 2026
ff8d4b3
Reorder CudaIpc branch to check same-host before cross-node
Binyang2014 Apr 10, 2026
43ba04a
Address PR review comments for multi-node CI
Binyang2014 Apr 10, 2026
bf4f099
Merge branch 'main' into binyli/multinode-ci
Binyang2014 Apr 10, 2026
6dd4e5b
Install GDRCopy 2.5.2 kernel module on host VMs during deploy
Binyang2014 Apr 10, 2026
62b48a5
Refactor CudaIpc import to remove redundant patterns and clarify fall…
Binyang2014 Apr 13, 2026
4eac8d8
Merge branch 'main' into binyli/multinode-ci
Binyang2014 Apr 13, 2026
a2bcc15
Simplify CudaIpc fallback: remove hasFabric check, add WARN for same-…
Binyang2014 Apr 13, 2026
0b6f893
Merge branch 'main' into binyli/multinode-ci
Binyang2014 Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 51 additions & 22 deletions .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,24 @@ pr: none


parameters:
- name: vmssName
type: string
default: mscclpp-h100-multinode-ci
- name: hostEntries
type: string
default: |
10.0.0.10 mscclit-000000
10.0.0.11 mscclit-000001
10.0.0.5 mscclpp-h100-multinode-ci000000
10.0.0.4 mscclpp-h100-multinode-ci000001

jobs:
- job: MultiNodesTest
displayName: Multi nodes test
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
pool:
name: mscclpp-it
name: mscclpp-multi-node
container:
image: $[ variables['containerImage'] ]

Expand All @@ -42,54 +43,82 @@ jobs:
inputs:
targetType: 'inline'
script: |
ENTRY="${{ parameters.hostEntries }}"
if ! grep -qxF "$ENTRY" /etc/hosts; then
echo "Adding to /etc/hosts"
echo "$ENTRY" | sudo tee -a /etc/hosts
else
echo "Entry already exists, nothing to do."
fi
while IFS= read -r line; do
[ -z "$line" ] && continue
if ! grep -qxF "$line" /etc/hosts; then
echo "Adding to /etc/hosts: $line"
echo "$line" | sudo tee -a /etc/hosts
else
echo "Entry already exists: $line"
fi
done <<< "${{ parameters.hostEntries }}"

- task: Bash@3
displayName: Generate deploy files
inputs:
targetType: 'inline'
script: |
set -e
VMSS="${{ parameters.vmssName }}"
DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
NODE0="${VMSS}000000"
NODE1="${VMSS}000001"

echo "Host ${NODE0}
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no
Host ${NODE1}
Port 22345
IdentityFile /root/mscclpp/sshkey
StrictHostKeyChecking no" > "${DEPLOY_DIR}/config"

printf '%s\n%s\n' "azureuser@${NODE0}" "azureuser@${NODE1}" > "${DEPLOY_DIR}/hostfile"

printf '%s\n%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"

- template: templates/deploy.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
resourceGroup: mscclpp
gpuArch: '90'

- template: templates/run-remote-task.yml
parameters:
name: RunMscclppTest
displayName: Run multi-nodes mscclpp-test
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
continueOnError: true
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test

- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodeUnitTest
displayName: Run multi-nodes unit tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh mp-ut

- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonTests
displayName: Run multi-nodes python tests
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh pytests

- template: templates/run-remote-task.yml
parameters:
name: RunMultiNodePythonBenchmark
displayName: Run multi-nodes python benchmark
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
remoteScript: |
bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark

- template: templates/stop.yml
parameters:
subscription: msccl-it
vmssName: mscclit-vmss
resourceGroup: msccl-IT
subscription: mscclpp-ci-h100
vmssName: ${{ parameters.vmssName }}
resourceGroup: mscclpp
4 changes: 4 additions & 0 deletions .azure-pipelines/templates/run-remote-task.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@ parameters:
- name: workingDirectory
type: string
default: '$(System.DefaultWorkingDirectory)'
- name: continueOnError
type: boolean
default: false

steps:
- task: Bash@3
${{ if ne(parameters.name, '') }}:
name: ${{ parameters.name }}
displayName: ${{ parameters.displayName }}
continueOnError: ${{ parameters.continueOnError }}
inputs:
targetType: 'inline'
script: |
Expand Down
5 changes: 0 additions & 5 deletions docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@ baseImageTable=(

declare -A extraLdPathTable
extraLdPathTable=(
Comment thread
Binyang2014 marked this conversation as resolved.
["cuda11.8"]="/usr/local/cuda-11.8/compat"
["cuda12.4"]="/usr/local/cuda-12.4/compat"
["cuda12.8"]="/usr/local/cuda-12.8/compat"
["cuda12.9"]="/usr/local/cuda-12.9/compat"
["cuda13.0"]="/usr/local/cuda-13.0/compat"
["rocm6.2"]="/opt/rocm/lib"
)

Expand Down
22 changes: 18 additions & 4 deletions src/core/registered_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,25 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
}
}
} else if (transports.has(Transport::CudaIpc)) {
// When transports include both CudaIpc and IB (e.g., CudaIpc | IB0),
// try CudaIpc first and fall back to IB on failure.
auto entry = getTransportInfo(Transport::CudaIpc);
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
// Create a memory map for the remote GPU memory. The memory map will keep the GpuIpcMem instance alive.
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
bool hasIB = (transports & AllIBTransports).any();
try {
auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
this->remoteMemMap = gpuIpcMem->map();
this->data = this->remoteMemMap.get();
} catch (const BaseError& e) {
if (!hasIB) {
throw;
}
bool isSameHost = (getHostHash() == this->hostHash);
if (isSameHost) {
WARN(GPU, "CudaIpc import failed on same host, falling back to IB transport: ", e.what());
} else {
INFO(GPU, "CudaIpc import failed on remote host, falling back to IB transport: ", e.what());
}
}
}
if (this->data != nullptr) {
INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);
Expand Down
8 changes: 0 additions & 8 deletions test/deploy/config

This file was deleted.

24 changes: 23 additions & 1 deletion test/deploy/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,34 @@ done

set -e
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} .
parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo mkdir -p ${DST_DIR} && sudo tar xzf /tmp/mscclpp.tar.gz -C ${DST_DIR} && sudo rm -f /tmp/mscclpp.tar.gz"
rm -f /tmp/mscclpp.tar.gz

if [ "${PLATFORM}" == "rocm" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
fi

# Install GDRCopy kernel module on host VMs (CUDA only)
GDRCOPY_VERSION="2.5.2"
if [ "${PLATFORM}" == "cuda" ]; then
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"if lsmod | grep -q gdrdrv; then
echo 'gdrdrv module already loaded'
else
set -e
sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz
tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages
CUDA=/usr/local/cuda ./build-deb-packages.sh
sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}*.deb
sudo modprobe gdrdrv
rm -rf /tmp/gdrcopy.tar.gz /tmp/gdrcopy-${GDRCOPY_VERSION}
fi"
fi

# force to pull the latest image
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker pull ${CONTAINERIMAGE}"
Expand Down
2 changes: 0 additions & 2 deletions test/deploy/hostfile

This file was deleted.

2 changes: 0 additions & 2 deletions test/deploy/hostfile_mpi

This file was deleted.

9 changes: 8 additions & 1 deletion test/deploy/perf_ndmv5.jsonl
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.98, "busBw":6.96, "size":24576, "time":6.18, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":7.42, "busBw":12.99, "size":49152, "time":6.62, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":10.67, "busBw":18.68, "size":73728, "time":6.91, "target":"latency"}
{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":430.62,"busBw":403.70, "size":3221225472, "time":7480.40, "target":"throughput"}
{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":0.54, "busBw":1.01, "size":8192, "time":15.10, "target":"latency"}
{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":201.46,"busBw":377.74, "size":3221225472, "time":15989.38,"target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":118.49,"busBw":222.17, "size":25165824, "time":212.39, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":138.48,"busBw":259.65, "size":50331648, "time":363.40, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":166.72,"busBw":312.60, "size":3221225472, "time":19321.02,"target":"throughput"}
{"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":96.94, "busBw":90.88, "size":1073741824, "time":11076.24,"target":"throughput"}
96 changes: 56 additions & 40 deletions test/deploy/run_tests.sh
Original file line number Diff line number Diff line change
@@ -1,83 +1,99 @@
set -e
HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
HEAD_HOST=$(head -1 ${HOSTFILE})
# Resolve HEAD_HOST to an IP address on eth0 to ensure bootstrap uses the correct interface
HEAD_IP=$(ssh -o StrictHostKeyChecking=no -p 22345 -i /root/mscclpp/sshkey ${HEAD_HOST} "ip -4 addr show eth0 | grep -oP 'inet \K[0-9.]+' | head -1" 2>/dev/null)
if [ -z "${HEAD_IP}" ]; then
HEAD_IP=${HEAD_HOST}
fi
MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0"
MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:\$LD_LIBRARY_PATH"
Comment thread
Binyang2014 marked this conversation as resolved.

# Select perf baseline based on GPU type
GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader -i 0 2>/dev/null | head -1)
if echo "${GPU_NAME}" | grep -qi "H100"; then
PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv5.jsonl
else
PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv4.jsonl
fi

function run_mscclpp_test()
{
echo "=================Run allgather_test_perf on 2 nodes========================="
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl

# For kernel 2, the message size must can be divided by 3
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl

mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl

echo "==================Run allreduce_test_perf on 2 nodes========================="
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl

mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl

mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl

mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl

mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl

echo "==================Run alltoall_test_perf on 2 nodes========================="
mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl

echo "========================Run performance check==============================="
python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \
--baseline-file /root/mscclpp/test/deploy/perf_ndmv4.jsonl
--baseline-file ${PERF_BASELINE}
}

function run_mp_ut()
{
echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)========================="
mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
mpirun ${MPI_ARGS} -tag-output -np 2 \
${MSCCLPP_ENV} \
-npernode 1 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003

echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)========================="
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
mpirun ${MPI_ARGS} -tag-output -np 16 \
${MSCCLPP_ENV} \
-npernode 8 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003
}

function run_pytests()
{
echo "==================Run python tests================================"
mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
mpirun ${MPI_ARGS} -tag-output -np 16 \
${MSCCLPP_ENV} \
-x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh
}

function run_py_benchmark()
{
echo "==================Run python benchmark================================"
mpirun -allow-run-as-root -np 16 --bind-to numa \
-hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
mpirun ${MPI_ARGS} -np 16 \
${MSCCLPP_ENV} \
-mca pml ob1 -mca btl ^openib -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
-x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \
-x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x NCCL_SHM_DISABLE=0 \
-x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
-x MSCCLPP_HOME=/root/mscclpp -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
}

if [ $# -lt 1 ]; then
Expand Down
Loading
Loading