microsoft · Binyang2014 · Apr 14, 2026 · Apr 8, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
@@ -16,23 +16,24 @@ pr: none
 
 
 parameters:
+- name: vmssName
+  type: string
+  default: mscclpp-h100-multinode-ci
 - name: hostEntries
   type: string
   default: |
-    10.0.0.10 mscclit-000000
-    10.0.0.11 mscclit-000001
+    10.0.0.5 mscclpp-h100-multinode-ci000000
+    10.0.0.4 mscclpp-h100-multinode-ci000001
 
 jobs:
 - job: MultiNodesTest
   displayName: Multi nodes test
   strategy:
     matrix:
-      cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
         containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9
   pool:
-    name: mscclpp-it
+    name: mscclpp-multi-node
   container:
     image: $[ variables['containerImage'] ]
 
@@ -42,54 +43,82 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        ENTRY="${{ parameters.hostEntries }}"
-        if ! grep -qxF "$ENTRY" /etc/hosts; then
-          echo "Adding to /etc/hosts"
-          echo "$ENTRY" | sudo tee -a /etc/hosts
-        else
-          echo "Entry already exists, nothing to do."
-        fi
+        while IFS= read -r line; do
+          [ -z "$line" ] && continue
+          if ! grep -qxF "$line" /etc/hosts; then
+            echo "Adding to /etc/hosts: $line"
+            echo "$line" | sudo tee -a /etc/hosts
+          else
+            echo "Entry already exists: $line"
+          fi
+        done <<< "${{ parameters.hostEntries }}"
+
+  - task: Bash@3
+    displayName: Generate deploy files
+    inputs:
+      targetType: 'inline'
+      script: |
+        # Writes three files into test/deploy for the two nodes named
+        # <vmssName>000000 and <vmssName>000001: config, hostfile, hostfile_mpi.
+        set -e
+        DEPLOY_DIR="$(System.DefaultWorkingDirectory)/test/deploy"
+        VMSS="${{ parameters.vmssName }}"
+        NODE0="${VMSS}000000"
+        NODE1="${VMSS}000001"
+
+        # SSH client config: one Host stanza per node, same settings for both.
+        for node in "${NODE0}" "${NODE1}"; do
+          printf 'Host %s\n  Port 22345\n  IdentityFile /root/mscclpp/sshkey\n  StrictHostKeyChecking no\n' "${node}"
+        done > "${DEPLOY_DIR}/config"
+
+        # hostfile: azureuser@<node>, one entry per line.
+        printf 'azureuser@%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile"
+
+        # hostfile_mpi: bare node names, one per line.
+        printf '%s\n' "${NODE0}" "${NODE1}" > "${DEPLOY_DIR}/hostfile_mpi"
 
   - template: templates/deploy.yml
     parameters:
-      subscription:  msccl-it
-      vmssName:      mscclit-vmss
-      resourceGroup: msccl-IT
+      subscription:  mscclpp-ci-h100
+      vmssName:      ${{ parameters.vmssName }}
+      resourceGroup: mscclpp
+      gpuArch:       '90'
 
   - template: templates/run-remote-task.yml
     parameters:
       name: RunMscclppTest
       displayName: Run multi-nodes mscclpp-test
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
+      continueOnError: true
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
       remoteScript: |
         bash /root/mscclpp/test/deploy/run_tests.sh mscclpp-test
 
   - template: templates/run-remote-task.yml
     parameters:
       name: RunMultiNodeUnitTest
       displayName: Run multi-nodes unit tests
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
       remoteScript: |
         bash /root/mscclpp/test/deploy/run_tests.sh mp-ut
 
   - template: templates/run-remote-task.yml
     parameters:
       name: RunMultiNodePythonTests
       displayName: Run multi-nodes python tests
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
       remoteScript: |
         bash /root/mscclpp/test/deploy/run_tests.sh pytests
 
   - template: templates/run-remote-task.yml
     parameters:
       name: RunMultiNodePythonBenchmark
       displayName: Run multi-nodes python benchmark
-      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host mscclit-000000 --user azureuser'
+      runRemoteArgs: '--hostfile $(System.DefaultWorkingDirectory)/test/deploy/hostfile --host ${{ parameters.vmssName }}000000 --user azureuser'
       remoteScript: |
         bash /root/mscclpp/test/deploy/run_tests.sh py-benchmark
 
   - template: templates/stop.yml
     parameters:
-      subscription:  msccl-it
-      vmssName:      mscclit-vmss
-      resourceGroup: msccl-IT
+      subscription:  mscclpp-ci-h100
+      vmssName:      ${{ parameters.vmssName }}
+      resourceGroup: mscclpp
diff --git a/.azure-pipelines/templates/run-remote-task.yml b/.azure-pipelines/templates/run-remote-task.yml
@@ -12,12 +12,16 @@ parameters:
 - name: workingDirectory
   type: string
   default: '$(System.DefaultWorkingDirectory)'
+- name: continueOnError
+  type: boolean
+  default: false
 
 steps:
 - task: Bash@3
   ${{ if ne(parameters.name, '') }}:
     name: ${{ parameters.name }}
   displayName: ${{ parameters.displayName }}
+  continueOnError: ${{ parameters.continueOnError }}
   inputs:
     targetType: 'inline'
     script: |

diff --git a/docker/build.sh b/docker/build.sh
@@ -14,11 +14,6 @@ baseImageTable=(
 
 declare -A extraLdPathTable
 extraLdPathTable=(
-    ["cuda11.8"]="/usr/local/cuda-11.8/compat"
-    ["cuda12.4"]="/usr/local/cuda-12.4/compat"
-    ["cuda12.8"]="/usr/local/cuda-12.8/compat"
-    ["cuda12.9"]="/usr/local/cuda-12.9/compat"
-    ["cuda13.0"]="/usr/local/cuda-13.0/compat"
     ["rocm6.2"]="/opt/rocm/lib"
 )
 

diff --git a/src/core/registered_memory.cc b/src/core/registered_memory.cc
@@ -158,11 +158,25 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
       }
     }
   } else if (transports.has(Transport::CudaIpc)) {
+    // When transports include both CudaIpc and IB (e.g., CudaIpc | IB0),
+    // try CudaIpc first and fall back to IB on failure.
     auto entry = getTransportInfo(Transport::CudaIpc);
-    auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
-    // Create a memory map for the remote GPU memory. The memory map will keep the GpuIpcMem instance alive.
-    this->remoteMemMap = gpuIpcMem->map();
-    this->data = this->remoteMemMap.get();
+    bool hasIB = (transports & AllIBTransports).any();
+    try {
+      auto gpuIpcMem = GpuIpcMem::create(entry.gpuIpcMemHandle);
+      this->remoteMemMap = gpuIpcMem->map();
+      this->data = this->remoteMemMap.get();
+    } catch (const BaseError& e) {
+      if (!hasIB) {
+        throw;
+      }
+      bool isSameHost = (getHostHash() == this->hostHash);
+      if (isSameHost) {
+        WARN(GPU, "CudaIpc import failed on same host, falling back to IB transport: ", e.what());
+      } else {
+        INFO(GPU, "CudaIpc import failed on remote host, falling back to IB transport: ", e.what());
+      }
+    }
   }
   if (this->data != nullptr) {
     INFO(GPU, "Opened CUDA IPC handle at pointer ", this->data);

diff --git a/test/deploy/config b/test/deploy/config
diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
@@ -33,12 +33,34 @@ done
 
 set -e
 parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo rm -rf ${DST_DIR}"
-parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
+tar czf /tmp/mscclpp.tar.gz -C ${ROOT_DIR} .
+parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION /tmp/mscclpp.tar.gz /tmp/mscclpp.tar.gz
+parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
+  "sudo mkdir -p ${DST_DIR} && sudo tar xzf /tmp/mscclpp.tar.gz -C ${DST_DIR} && sudo rm -f /tmp/mscclpp.tar.gz"
+rm -f /tmp/mscclpp.tar.gz
 
 if [ "${PLATFORM}" == "rocm" ]; then
   parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION "sudo modprobe amdgpu"
 fi
 
+# Install GDRCopy kernel module on host VMs (CUDA only)
+GDRCOPY_VERSION="2.5.2"
+if [ "${PLATFORM}" == "cuda" ]; then
+  parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
+    "if lsmod | grep -q gdrdrv; then
+      echo 'gdrdrv module already loaded'
+    else
+      set -e
+      sudo apt-get update -y && sudo apt-get install -y build-essential devscripts debhelper check libsubunit-dev fakeroot pkg-config dkms
+      cd /tmp && wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz -O gdrcopy.tar.gz
+      tar xzf gdrcopy.tar.gz && cd gdrcopy-${GDRCOPY_VERSION}/packages
+      CUDA=/usr/local/cuda ./build-deb-packages.sh
+      sudo dpkg -i gdrdrv-dkms_${GDRCOPY_VERSION}*.deb
+      sudo modprobe gdrdrv
+      rm -rf /tmp/gdrcopy.tar.gz /tmp/gdrcopy-${GDRCOPY_VERSION}
+    fi"
+fi
+
 # force to pull the latest image
 parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
   "sudo docker pull ${CONTAINERIMAGE}"

diff --git a/test/deploy/hostfile b/test/deploy/hostfile
diff --git a/test/deploy/hostfile_mpi b/test/deploy/hostfile_mpi
diff --git a/test/deploy/perf_ndmv5.jsonl b/test/deploy/perf_ndmv5.jsonl
@@ -1,3 +1,10 @@
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8,  "algBw":3.98,  "busBw":6.96,   "size":24576,      "time":6.18,    "target":"latency"}
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8,  "algBw":7.42,  "busBw":12.99,  "size":49152,      "time":6.62,    "target":"latency"}
-{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8,  "algBw":10.67, "busBw":18.68,  "size":73728,      "time":6.91,    "target":"latency"}
+{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8,  "algBw":10.67, "busBw":18.68,  "size":73728,      "time":6.91,    "target":"latency"}
+{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8,  "algBw":430.62,"busBw":403.70, "size":3221225472, "time":7480.40, "target":"throughput"}
+{"name":"allreduce", "kernel":2, "ranks":16,"ranksPerNode":8,  "algBw":0.54,  "busBw":1.01,   "size":8192,       "time":15.10,   "target":"latency"}
+{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8,  "algBw":201.46,"busBw":377.74, "size":3221225472, "time":15989.38,"target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8,  "algBw":118.49,"busBw":222.17, "size":25165824,   "time":212.39,  "target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8,  "algBw":138.48,"busBw":259.65, "size":50331648,   "time":363.40,  "target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8,  "algBw":166.72,"busBw":312.60, "size":3221225472, "time":19321.02,"target":"throughput"}
+{"name":"alltoall",  "kernel":0, "ranks":16,"ranksPerNode":8,  "algBw":96.94, "busBw":90.88,  "size":1073741824, "time":11076.24,"target":"throughput"}
diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh
@@ -1,83 +1,99 @@
 set -e
 HOSTFILE=/root/mscclpp/test/deploy/hostfile_mpi
+HEAD_HOST=$(head -n 1 "${HOSTFILE}")
+# Resolve HEAD_HOST to its eth0 IPv4 address so bootstrap uses the intended interface.
+# `|| true` is required: under `set -e` a failed ssh would abort the script before the fallback below.
+HEAD_IP=$(ssh -o StrictHostKeyChecking=no -p 22345 -i /root/mscclpp/sshkey "${HEAD_HOST}" "ip -4 addr show eth0 | grep -oP 'inet \K[0-9.]+' | head -1" 2>/dev/null) || true
+# Fall back to the host name itself when the lookup produced no address.
+HEAD_IP=${HEAD_IP:-${HEAD_HOST}}
+MPI_ARGS="--allow-run-as-root --bind-to numa -hostfile ${HOSTFILE} -mca btl_tcp_if_include eth0"
+# LD_LIBRARY_PATH is expanded here on purpose: an escaped \$ would reach mpirun as literal text and drop the inherited paths.
+MSCCLPP_ENV="-x MSCCLPP_DEBUG=WARN -x MSCCLPP_SOCKET_IFNAME=eth0 -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:${LD_LIBRARY_PATH}"
+# Select perf baseline based on GPU type
+GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader -i 0 2>/dev/null | head -1)
+if echo "${GPU_NAME}" | grep -qi "H100"; then
+    PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv5.jsonl
+else
+    PERF_BASELINE=/root/mscclpp/test/deploy/perf_ndmv4.jsonl
+fi
 
 function run_mscclpp_test()
 {
   echo "=================Run allgather_test_perf on 2 nodes========================="
-  mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
+  mpirun ${MPI_ARGS} -np 16 \
+    ${MSCCLPP_ENV} \
+    -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
 
   # For kernel 2, the message size must can be divided by 3
-  mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
+  mpirun ${MPI_ARGS} -np 16 \
+    ${MSCCLPP_ENV} \
+    -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
 
-  mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
+  mpirun ${MPI_ARGS} -np 16 \
+    ${MSCCLPP_ENV} \
+    -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
 
   echo "==================Run allreduce_test_perf on 2 nodes========================="
-  mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
+  mpirun ${MPI_ARGS} -np 16 \
+    ${MSCCLPP_ENV} \
+    -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
 
-  mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
+  mpirun ${MPI_ARGS} -np 16 \
+    ${MSCCLPP_ENV} \
+    -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
 
-  mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl
+  mpirun ${MPI_ARGS} -np 16 \
+    ${MSCCLPP_ENV} \
+    -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 1K -e 1M -f 2 -k 2 -o /root/mscclpp/output.jsonl
 
-  mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl
+  mpirun ${MPI_ARGS} -np 16 \
+    ${MSCCLPP_ENV} \
+    -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 3 -o /root/mscclpp/output.jsonl
 
-  mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl
+  mpirun ${MPI_ARGS} -np 16 \
+    ${MSCCLPP_ENV} \
+    -npernode 8 /root/mscclpp/build/bin/mscclpp-test/allreduce_test_perf -b 3K -e 3G -f 2 -k 4 -o /root/mscclpp/output.jsonl
 
   echo "==================Run alltoall_test_perf on 2 nodes========================="
-  mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile ${HOSTFILE} \
-    -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
+  mpirun ${MPI_ARGS} -np 16 \
+    ${MSCCLPP_ENV} \
+    -npernode 8 /root/mscclpp/build/bin/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
 
   echo "========================Run performance check==============================="
   python3 /root/mscclpp/test/mscclpp-test/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \
-    --baseline-file /root/mscclpp/test/deploy/perf_ndmv4.jsonl
+    --baseline-file ${PERF_BASELINE}
 }
 
 function run_mp_ut()
 {
   echo "============Run multi-process unit tests on 2 nodes (np=2, npernode=1)========================="
-  mpirun -allow-run-as-root -tag-output -np 2 --bind-to numa \
-  -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-  -npernode 1 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
+  mpirun ${MPI_ARGS} -tag-output -np 2 \
+  ${MSCCLPP_ENV} \
+  -npernode 1 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003
 
   echo "============Run multi-process unit tests on 2 nodes (np=16, npernode=8)========================="
-  mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
-  -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-  -npernode 8 /root/mscclpp/build/test/mp_unit_tests -ip_port mscclit-000000:20003
+  mpirun ${MPI_ARGS} -tag-output -np 16 \
+  ${MSCCLPP_ENV} \
+  -npernode 8 /root/mscclpp/build/bin/mp_unit_tests -ip_port ${HEAD_IP}:20003
 }
 
 function run_pytests()
 {
   echo "==================Run python tests================================"
-  mpirun -allow-run-as-root -tag-output -np 16 --bind-to numa \
-  -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
+  mpirun ${MPI_ARGS} -tag-output -np 16 \
+  ${MSCCLPP_ENV} \
   -x MSCCLPP_HOME=/root/mscclpp -npernode 8 bash /root/mscclpp/test/deploy/pytest.sh
 }
 
 function run_py_benchmark()
 {
   echo "==================Run python benchmark================================"
-  mpirun -allow-run-as-root -np 16 --bind-to numa \
-  -hostfile ${HOSTFILE} -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build/lib:$LD_LIBRARY_PATH \
-  -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
+  mpirun ${MPI_ARGS} -np 16 \
+  ${MSCCLPP_ENV} \
+  -mca pml ob1 -mca btl ^openib -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
   -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \
   -x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x NCCL_SHM_DISABLE=0 \
-  -x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
+  -x MSCCLPP_HOME=/root/mscclpp -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
 }
 
 if [ $# -lt 1 ]; then