Skip to content

Commit b6d0ca1

Browse files
authored
Adding CI Test to DSL Executor (#782)
1 parent b59e6d7 commit b6d0ca1

18 files changed

Lines changed: 3889 additions & 0 deletions
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Step template for the DSL executor CI test: deploy a single-node test VMSS,
# run the executor tests over MPI (2 ranks) against each execution plan,
# then stop the VMSS.
parameters:
  - name: subscription
    type: string
  - name: vmssName
    type: string
  - name: platform
    type: string
    default: 'cuda'
  - name: gpuArch
    type: string

steps:
  # Provision the VMSS configured for a single-node test run.
  - template: deploy.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}
      platform: ${{ parameters.platform }}
      gpuArch: ${{ parameters.gpuArch }}
      deployArgs: 'single-node-test true ${{ parameters.platform }}'

  # Install the package on the remote node and run every execution plan
  # through the executor test harness (2 MPI ranks, 32M buffers, in place).
  - template: run-remote-task.yml
    parameters:
      name: ExecutorTest
      displayName: Run executor tests
      remoteScript: |
        python3 -m pip install .
        PLANS_DIR=/root/mscclpp/test/executor-tests/execution-plans
        TEST_SCRIPT=/root/mscclpp/python/test/executor_test.py
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/transfer_pack_tbg.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_tbg.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_pack_tbg.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls.json --size 32M --in_place
        mpirun -np 2 --allow-run-as-root python3 $TEST_SCRIPT -path $PLANS_DIR/reduce_nvls_pipeline.json --size 32M --in_place

  # Always tear the VMSS back down after the test run.
  - template: stop.yml
    parameters:
      subscription: ${{ parameters.subscription }}
      vmssName: ${{ parameters.vmssName }}

.azure-pipelines/ut.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,24 @@ jobs:
148148
vmssName: mscclpp-mi300x-ci
149149
platform: rocm
150150
gpuArch: gfx942
151+
152+
# CI job running the DSL executor tests on the H100 pool.
# NOTE(review): `platform` is not passed to the template, so it falls back to
# the template default ('cuda') — confirm that is intended for this pool.
- job: UnitTestExecutor
  timeoutInMinutes: 60
  displayName: Test DSL Executor
  pool:
    name: msccl-ci-h100

  # Matrix over container images; currently a single CUDA 12.9 entry.
  strategy:
    matrix:
      cuda12:
        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.9

  container:
    image: $(containerImage)

  steps:
    - template: templates/ut-executor.yml
      parameters:
        subscription: mscclpp-ci-h100
        vmssName: mscclpp-h100-ci
        gpuArch: '90'
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""
5+
Reduce Test
6+
7+
This file tests the PUT, GET, COPY, REDUCE_SEND and READ_REDUCE_SEND
8+
operations. It implements a 2-GPU allreduce using the Simple protocol
9+
with instruction fusion enabled.
10+
"""
11+
12+
import argparse
13+
from mscclpp.language.channel import *
14+
from mscclpp.language.rank import *
15+
from mscclpp.language.general import *
16+
from mscclpp.language.program import *
17+
from mscclpp.language.collectives import *
18+
19+
20+
def reduce(name, num_threads_per_block, min_message_size, max_message_size):
    """Emit a 2-GPU allreduce execution plan exercising the PUT, GET, COPY,
    REDUCE_SEND and READ_REDUCE_SEND operations.

    Uses the Simple protocol with instruction fusion enabled and prints the
    generated plan as JSON.

    Args:
        name: name embedded in the generated program.
        num_threads_per_block: threads per GPU thread block.
        min_message_size: smallest message size the plan covers.
        max_message_size: largest message size the plan covers.
    """
    collective = AllReduce(2, 2, True)
    with CollectiveProgram(
        name,
        collective,
        2,
        protocol="Simple",
        instr_fusion=True,
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=False,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    ):
        # Ranks, memory channels, and input/scratch buffers for the two GPUs.
        rank0 = Rank(0)
        rank1 = Rank(1)
        chan0 = MemoryChannel(1, 0)  # channel endpoints between ranks 1 and 0
        chan1 = MemoryChannel(0, 1)  # channel endpoints between ranks 0 and 1
        input0 = rank0.get_input_buffer()
        input1 = rank1.get_input_buffer()
        scratch0 = Buffer(0, 4)
        scratch1 = Buffer(1, 4)

        # Stage each rank's input chunks into scratch so the peer can access them.
        rank0.copy(scratch0[2:4], input0[2:4], tb=0)
        rank1.copy(scratch1[0:2], input1[0:2], tb=0)

        # Barrier: make the staged scratch data visible to the remote rank.
        chan0.signal(tb=0)
        chan1.signal(tb=0)
        chan0.wait(tb=0)
        chan1.wait(tb=0)

        # Rank 0 reduces chunk 0 from rank 1's scratch and pushes the result
        # back so both ranks hold it.
        chan0.reduce(input0[0:1], [scratch1[0:1]], tb=0)
        chan0.put(input1[0:1], input0[0:1], tb=0)

        # Rank 0 pulls chunk 1 from rank 1's scratch, reduces it locally,
        # then pushes the result to both ranks.
        chan0.get(scratch0[1:2], scratch1[1:2], tb=0)
        rank0.reduce(input0[1:2], [scratch0[1:2]], tb=0)
        chan0.put(input1[1:2], input0[1:2], tb=0)

        # Rank 1 reduces chunks 2-3 from rank 0's input, stages the result in
        # scratch, and pushes it to both ranks.
        chan1.reduce(input1[2:4], [input0[2:4]], tb=0)
        rank1.copy(scratch1[2:4], input1[2:4], tb=0)
        chan1.put(input0[2:4], scratch1[2:4], tb=0)

        # Final barrier: all reduced data consistent across both ranks.
        chan0.signal(tb=0)
        chan1.signal(tb=0)
        chan0.wait(tb=0)
        chan1.wait(tb=0)

        print(JSON())
76+
77+
78+
if __name__ == "__main__":
    # Guard the CLI entry point so importing this module does not parse
    # sys.argv or emit a program as a side effect.
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", type=str, help="name of the program")
    parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
    parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
    parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")

    args = parser.parse_args()
    reduce(args.name, args.num_threads_per_block, args.min_message_size, args.max_message_size)
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""
5+
Reduce NVLS Test
6+
7+
This file tests the executor MULTI_LOAD_REDUCE_STORE operation using
8+
NVLS SwitchChannels. Each GPU reduces its chunk via the
9+
NVSwitch and broadcasts the result to all other GPUs.
10+
"""
11+
12+
import argparse
13+
from mscclpp.language.channel import *
14+
from mscclpp.language.rank import *
15+
from mscclpp.language.general import *
16+
from mscclpp.language.program import *
17+
from mscclpp.language.collectives import *
18+
19+
20+
def reduce_nvls(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
    """Emit an allreduce plan testing MULTI_LOAD_REDUCE_STORE over NVLS
    SwitchChannels: each GPU reduces its chunk via the NVSwitch and
    broadcasts the result to all other GPUs. Prints the plan as JSON.

    Args:
        name: name embedded in the generated program.
        gpu_size: number of participating GPUs.
        num_threads_per_block: threads per GPU thread block.
        min_message_size: smallest message size the plan covers.
        max_message_size: largest message size the plan covers.
    """
    chunksperloop = 1
    collective = AllReduce(gpu_size, chunksperloop, True)
    with CollectiveProgram(
        name,
        collective,
        gpu_size,
        instances=1,
        protocol="Simple",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=False,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    ):
        # NVLS switch channel spanning all ranks, plus a full mesh of memory
        # channels used only for synchronization.
        nvls_chan = SwitchChannel(rank_list=list(range(gpu_size)), buffer_type=BufferType.input)
        channels = {
            (peer, gpu): MemoryChannel(peer, gpu)
            for gpu in range(gpu_size)
            for peer in range(gpu_size)
            if peer != gpu
        }

        def _all_pairs_sync(signal_kwargs, wait_kwargs):
            # All-pairs barrier: per rank, signal every peer first, then wait
            # on every peer — emission order matches the original code.
            for rank in range(gpu_size):
                for peer in range(gpu_size):
                    if peer != rank:
                        channels[(peer, rank)].signal(tb=0, **signal_kwargs)
                for peer in range(gpu_size):
                    if peer != rank:
                        channels[(peer, rank)].wait(tb=0, **wait_kwargs)

        # Synchronization to ensure all the GPUs are ready.
        _all_pairs_sync({"relaxed": True}, {"relaxed": True, "data_sync": SyncType.after})

        # Each GPU reduces its own chunk through the switch and broadcasts the
        # result back into every rank's input buffer.
        for gpu in range(gpu_size):
            input_buffer = Rank(gpu).get_input_buffer()
            nvls_chan.at_rank(gpu).reduce(
                buffer_offset=gpu, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
            )
            nvls_chan.at_rank(gpu).broadcast(
                src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=gpu, size=1, tb=0
            )

        # Synchronization to ensure the GPUs finished.
        _all_pairs_sync({"relaxed": True, "data_sync": SyncType.before}, {"relaxed": True})

        print(JSON())
79+
80+
81+
if __name__ == "__main__":
    # Guard the CLI entry point so importing this module does not parse
    # sys.argv or emit a program as a side effect.
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", type=str, help="name of the program")
    parser.add_argument("--num_gpus", type=int, help="number of gpus")
    parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
    parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
    parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")

    args = parser.parse_args()
    reduce_nvls(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
"""
5+
Reduce NVLS Pipeline Test
6+
7+
This file tests the executor MULTI_LOAD_REDUCE_STORE operation in a
8+
pipeline context using SwitchChannel. Each GPU reduces
9+
its chunk via the NVSwitch and broadcasts the result, processing data
10+
in a pipelined loop over fixed-size iterations.
11+
"""
12+
13+
import argparse
14+
from mscclpp.language.channel import *
15+
from mscclpp.language.rank import *
16+
from mscclpp.language.general import *
17+
from mscclpp.language.program import *
18+
from mscclpp.language.collectives import *
19+
from mscclpp.language.loop import LoopIterationContext
20+
21+
22+
def reduce_nvls_pipeline(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
    """Emit an allreduce plan testing MULTI_LOAD_REDUCE_STORE in a pipelined
    context using SwitchChannel: each GPU reduces its chunk via the NVSwitch
    and broadcasts the result, processing data in a pipelined loop over
    fixed-size iterations. Prints the plan as JSON.

    Args:
        name: name embedded in the generated program.
        gpu_size: number of participating GPUs.
        num_threads_per_block: threads per GPU thread block.
        min_message_size: smallest message size the plan covers.
        max_message_size: largest message size the plan covers.
    """
    chunksperloop = 1
    collective = AllReduce(gpu_size, chunksperloop, True)
    with CollectiveProgram(
        name,
        collective,
        gpu_size,
        instances=1,
        protocol="Simple",
        num_threads_per_block=num_threads_per_block,
        use_double_scratch_buffer=False,
        min_message_size=min_message_size,
        max_message_size=max_message_size,
    ):
        # NVLS switch channel spanning all ranks, plus a full mesh of memory
        # channels used only for synchronization.
        nvls_chan = SwitchChannel(rank_list=list(range(gpu_size)), buffer_type=BufferType.input)
        channels = {
            (peer, gpu): MemoryChannel(peer, gpu)
            for gpu in range(gpu_size)
            for peer in range(gpu_size)
            if peer != gpu
        }

        def _all_pairs_sync(signal_kwargs, wait_kwargs):
            # All-pairs barrier: per rank, signal every peer first, then wait
            # on every peer — emission order matches the original code.
            for rank in range(gpu_size):
                for peer in range(gpu_size):
                    if peer != rank:
                        channels[(peer, rank)].signal(tb=0, **signal_kwargs)
                for peer in range(gpu_size):
                    if peer != rank:
                        channels[(peer, rank)].wait(tb=0, **wait_kwargs)

        # Synchronization to ensure all the GPUs are ready.
        _all_pairs_sync({"relaxed": True}, {"relaxed": True, "data_sync": SyncType.after})

        # Pipelined reduce/broadcast: process the data in 1 MiB iterations.
        with LoopIterationContext(unit=2**20, num_chunks=1):
            for gpu in range(gpu_size):
                input_buffer = Rank(gpu).get_input_buffer()
                nvls_chan.at_rank(gpu).reduce(
                    buffer_offset=gpu, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
                )
                nvls_chan.at_rank(gpu).broadcast(
                    src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=gpu, size=1, tb=0
                )

        # Synchronization to ensure the GPUs finished.
        _all_pairs_sync({"relaxed": True, "data_sync": SyncType.before}, {"relaxed": True})

        print(JSON())
82+
83+
84+
if __name__ == "__main__":
    # Guard the CLI entry point so importing this module does not parse
    # sys.argv or emit a program as a side effect.
    parser = argparse.ArgumentParser()
    parser.add_argument("--name", type=str, help="name of the program")
    parser.add_argument("--num_gpus", type=int, help="number of gpus")
    parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
    parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
    parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")

    args = parser.parse_args()
    reduce_nvls_pipeline(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)

0 commit comments

Comments
 (0)