20 changes: 12 additions & 8 deletions CHANGELOG.md
@@ -14,16 +14,20 @@ Documentation for TransferBench is available at
- Adding NIC_CQ_POLL_BATCH to control CQ poll batch size for NIC transfers
- New "hbm" preset which sweeps and tests local HBM read performance
- Added a new TB_WALLCLOCK_RATE that overrides the GPU GFX wallclock rate when the queried rate returns 0 (debug)
- Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.1
- Added new bmasweep preset that compares DMA to batched DMA execution for parallel transfers to other GPUs

### Modified
- DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options
- Adding extra information to CMake and make build methods to indicate enabled / disabled features
- a2asweep preset changes from USE_FINE_GRAIN to MEM_TYPE to reflect various memory types
- a2asweep preset changes from NUM_CUS to NUM_SUB_EXECS to match with a2a preset naming convention
- scaling preset changes from using USE_FINE_GRAIN to CPU_MEM_TYPE and GPU_MEM_TYPE
- NIC_FILTER renamed to TB_NIC_FILTER for consistency
- DUMP_LINES renamed to TB_DUMP_LINES for consistency
- Dynamically size CQs for NIC transfers in high QPs case
- Switch to using hipMemcpyDeviceToDeviceNoCU instead of hipMemcpyDefault for DMA Executor if available (requires HIP >= 6.0)
- Allow for multiple destination memory locations for DMA/Batched-DMA Transfers

## v1.66.02
### Added
6 changes: 4 additions & 2 deletions examples/example.cfg
@@ -8,12 +8,13 @@
# SRC 1 -> Executor -> DST 1
# SRC X DST Y

# Five Executors are supported by TransferBench
# Executor: SubExecutor:
# 1) CPU CPU thread
# 2) GPU GPU threadblock/Compute Unit (CU)
# 3) DMA N/A. (Must have single SRC, at least one DST)
# 4) NIC Queue Pair
# 5) Batched-DMA Batch item (Must have single SRC, at least one DST)

# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel

@@ -38,6 +39,7 @@
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# - B: Batched-DMA-executor (Indexed from 0 to # GPUs - 1)
# - I#.#: NIC executor (Indexed from 0 to # NICs - 1)
# - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory locations (Where the data is to be written to)
182 changes: 182 additions & 0 deletions src/client/Presets/BmaSweep.hpp
@@ -0,0 +1,182 @@
/*
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

int BmaSweepPreset(EnvVars& ev,
                   size_t const numBytesPerTransfer,
                   std::string const presetName,
                   bool const bytesSpecified)
{
  if (TransferBench::GetNumRanks() > 1) {
    Utils::Print("[ERROR] BMA sweep preset currently not supported for multi-node\n");
    return 1;
  }

#ifndef BMA_EXEC_ENABLED
  Utils::Print("[ERROR] BMA executor requires ROCm 7.1 or newer\n");
  return 1;
#endif

  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

  // Collect env vars for this preset
  int         exeIndex      = EnvVars::GetEnvVar("EXE_INDEX"      , 0);
  int         localCopy     = EnvVars::GetEnvVar("LOCAL_COPY"     , 0);
  vector<int> gfxSesList    = EnvVars::GetEnvVarArray("GFX_SUB_EXECS", {});
  int         gpuMemTypeIdx = EnvVars::GetEnvVar("GPU_MEM_TYPE"   , 0);
  int         numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
  vector<int> bmaSesList    = EnvVars::GetEnvVarArray("NUM_SUB_EXECS", {1,2,4,8});

  MemType gpuMemType = Utils::GetGpuMemType(gpuMemTypeIdx);

  // Display environment variables
  if (Utils::RankDoesOutput()) {
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
      int outputToCsv = ev.outputToCsv;
      if (!outputToCsv) printf("[BMA Sweep Related]\n");
      ev.Print("EXE_INDEX"      , exeIndex, "Executing on GPU %d", exeIndex);
      ev.Print("LOCAL_COPY"     , localCopy, "%s local copy to GPU %d", localCopy ? "Including" : "Excluding", exeIndex);
      ev.Print("GFX_SUB_EXECS"  , gfxSesList.size(), EnvVars::ToStr(gfxSesList).c_str());
      ev.Print("GPU_MEM_TYPE"   , gpuMemTypeIdx, "Using %s (%s)", Utils::GetGpuMemTypeStr(gpuMemTypeIdx).c_str(), Utils::GetAllGpuMemTypeStr().c_str());
      ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
      ev.Print("NUM_SUB_EXECS"  , bmaSesList.size(), EnvVars::ToStr(bmaSesList).c_str());
      printf("\n");
    }
  }

  if (exeIndex < 0 || exeIndex >= numGpuDevices) {
    Utils::Print("[ERROR] EXE_INDEX must be between 0 and %d inclusive\n", numGpuDevices - 1);
    return 1;
  }

  int numTransfers  = numGpuDevices - 1 + (localCopy ? 1 : 0);
  int numBmaSubExec = (int)bmaSesList.size();
  int numGfxSubExec = (int)gfxSesList.size();

  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
  TransferBench::TestResults results;

  // Prepare table of results
  int minPow2Exp = 12;
  int maxPow2Exp = 30;
  int numRows = 1 + (bytesSpecified ? 1 : (maxPow2Exp - minPow2Exp + 1));
  int numCols = 2 + numBmaSubExec + numGfxSubExec;

  Utils::TableHelper table(numRows, numCols);
  Utils::Print("Performing %d simultaneous Transfers from GPU %d to other GPUs\n", numTransfers, exeIndex);

  // Prepare headers
  table.Set(0, 0, " Bytes ");
  table.Set(0, 1, " DMA ");
  for (int i = 0; i < numBmaSubExec; i++) {
    table.Set(0, 2+i, " BMA(%02d) ", bmaSesList[i]);
  }
  for (int i = 0; i < numGfxSubExec; i++) {
    table.Set(0, 2+numBmaSubExec+i, " GFX(%02d) ", gfxSesList[i]);
  }

  table.DrawRowBorder(0);
  table.DrawRowBorder(1);
  table.DrawRowBorder(numRows);
  table.DrawColBorder(0);
  table.DrawColBorder(1);
  table.DrawColBorder(2);
  table.DrawColBorder(2+numBmaSubExec);
  table.DrawColBorder(numCols);

  if (!ev.outputToCsv) {
    Utils::Print("Executing: ");
    fflush(stdout);
  }

  for (size_t numBytes = 1ULL<<minPow2Exp, currRow = 1; numBytes <= (1ULL<<maxPow2Exp); numBytes <<= 1, currRow++) {
    if (bytesSpecified) numBytes = numBytesPerTransfer;

    if (!ev.outputToCsv) {
      Utils::Print(".");
      fflush(stdout);
    }

    table.Set(currRow, 0, " %zu ", numBytes);
    std::vector<Transfer> transfers(1);

    Transfer& t = transfers[0];
    t.numBytes = numBytes;
    t.srcs = {{gpuMemType, exeIndex}};
    t.dsts.clear();
    for (int i = 0; i < numGpuDevices; i++) {
      if (i == exeIndex && localCopy == 0) continue;
      t.dsts.push_back({gpuMemType, i});
    }

    // DMA executor first
    t.exeDevice = {EXE_GPU_DMA, exeIndex};
    t.numSubExecs = 1;

    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
      for (auto const& err : results.errResults)
        Utils::Print("%s\n", err.errMsg.c_str());
      return 1;
    }

    table.Set(currRow, 1, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);

    // BMA executor next
    t.exeDevice = {EXE_GPU_BDMA, exeIndex};
    for (int i = 0; i < numBmaSubExec; i++) {
      t.numSubExecs = bmaSesList[i];

      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
        for (auto const& err : results.errResults)
          Utils::Print("%s\n", err.errMsg.c_str());
        return 1;
      }

      table.Set(currRow, 2+i, " %6.2f ", numTransfers * results.tfrResults[0].avgBandwidthGbPerSec);
    }

    // GFX executor last
    t.exeDevice = {EXE_GPU_GFX, exeIndex};
    for (int i = 0; i < numGfxSubExec; i++) {
      t.numSubExecs = gfxSesList[i];

      if (!TransferBench::RunTransfers(cfg, transfers, results)) {
        for (auto const& err : results.errResults)
          Utils::Print("%s\n", err.errMsg.c_str());
        return 1;
      }

      table.Set(currRow, 2+numBmaSubExec+i, " %6.2f ", results.tfrResults[0].avgBandwidthGbPerSec);
    }
    if (bytesSpecified) break;
  }

  if (!ev.outputToCsv) {
    Utils::Print("\n");
  }
  table.PrintTable(ev.outputToCsv, ev.showBorders);
  Utils::Print("Reported numbers are all GB/s, normalized per Transfer for %d Transfers\n", numTransfers);

  return 0;
}
4 changes: 3 additions & 1 deletion src/client/Presets/Presets.hpp
@@ -30,6 +30,7 @@ THE SOFTWARE.
#include "AllToAll.hpp"
#include "AllToAllN.hpp"
#include "AllToAllSweep.hpp"
#include "BmaSweep.hpp"
#include "GfxSweep.hpp"
#include "HbmBandwidth.hpp"
#include "HealthCheck.hpp"
@@ -53,7 +54,8 @@ std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}},
{"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
{"bmasweep", {BmaSweepPreset, "Tests and compares the batched-DMA executor for multi-destination copies"}},
{"gfxsweep", {GfxSweepPreset, "Sweep over various GFX kernel options for a given GFX Transfer"}},
{"hbm", {HbmBandwidthPreset, "Tests HBM bandwidth"}},
{"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}},
{"nicrings", {NicRingsPreset, "Tests NIC rings created across identical NIC indices across ranks"}},
13 changes: 7 additions & 6 deletions src/client/Utilities.hpp
@@ -393,12 +393,13 @@ namespace TransferBench::Utils
std::string ExeTypeToStr(ExeType exeType)
{
switch (exeType) {
case EXE_CPU: return "CPU";
case EXE_GPU_GFX: return "GPU";
case EXE_GPU_DMA: return "DMA";
case EXE_NIC: return "NIC";
case EXE_NIC_NEAREST: return "NIC";
case EXE_GPU_BDMA: return "BMA";
default: return "N/A";
}
}
